From c344d007dd79c04b2a1a3b7d43a71825b28450b7 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 20:32:49 -0500 Subject: [PATCH] [c++, r] Update and refactor nanoarrow (#2188) * Update nanoarrow vendored files to nanoarrow 0.4.0 * Low-level wiring of nanoarrow at sr_* level * More lower-level wiring of nanoarrow * WIP snapshot with nanoarrow wired into libtiledbsoma * Ensure nullable is set correctly in either case * Context wrapped in a special purpose struct should not finalize * Simpler and faster r-ci.yaml * Use nanoarrow 0.4.0 consistently * Refined arrow_adapter * Set increased timeout for download.file to survive GH flakyness * Turn trace back of, do not include carrow in cli * Do not include carrow.h in reindexer.cc * WIP changes expanding type map, suppressing schema release * [c++] Fix segfault issues * Add additional necessary strdup * No longer to protect one statement * Support TILEDB_DATETIME_DAY aka Date as well * Meh * Meh with version 14.0.0 and not 14.0.6 because ... sure * Remove initialization setters covered by nanoarrow use * Ensure DATETIME columns get Arrow coltype reset * Add more date and datetime support * Additional conversion * Post-rebase change * Heeding time to the lord of linting is time well spent some say * Heeding time to the lord of linting is time well spent some say * Correct another delete to free * Additional non-nullptr protection * make format * Additional test conditioner * Correcting one buffer size selection * make format * Remove carrow.h and reference to it * Cleanups * Use nanoarrow.{c,hpp} via tiledbsoma/utils/ * Re-activate -Werror * Chore * High-productivity afternoon * Correct an format string error message --------- Co-authored-by: Vivian Nguyen --- .github/workflows/r-ci.yml | 8 - apis/python/src/tiledbsoma/reindexer.cc | 2 +- apis/r/DESCRIPTION | 6 +- apis/r/NAMESPACE | 1 + apis/r/R/RcppExports.R | 6 + apis/r/R/utils-arrow.R | 11 +- apis/r/R/utils-readerTransformers.R | 7 +- apis/r/inst/include/tiledbsoma_types.h | 2 +- apis/r/src/RcppExports.cpp | 15 +- apis/r/src/nanoarrow.h | 3097 ------------- apis/r/src/rinterface.cpp | 78 +- apis/r/src/riterator.cpp | 91 +- apis/r/src/rutilities.cpp | 10 +- apis/r/src/rutilities.h | 10 + apis/r/tests/testthat/helper-test-data.R | 17 +- apis/r/tests/testthat/test-Factory.R | 4 +- .../testthat/test-SOMAArrayReader-Iterated.R | 10 +- apis/r/tests/testthat/test-SOMADataFrame.R | 3 +- apis/r/tools/r-ci.sh | 4 +- libtiledbsoma/src/CMakeLists.txt | 35 +- libtiledbsoma/src/cli/cli.cc | 1 - libtiledbsoma/src/utils/arrow_adapter.cc | 340 +- libtiledbsoma/src/utils/arrow_adapter.h | 7 +- libtiledbsoma/src/utils/carrow.h | 50 - .../src/utils}/nanoarrow.c | 1014 +++-- libtiledbsoma/src/utils/nanoarrow.h | 3930 +++++++++++++++++ libtiledbsoma/src/utils/nanoarrow.hpp | 501 +++ 27 files changed, 5613 insertions(+), 3647 deletions(-) delete mode 100644 apis/r/src/nanoarrow.h delete mode 100644 libtiledbsoma/src/utils/carrow.h rename {apis/r/src => libtiledbsoma/src/utils}/nanoarrow.c (74%) create mode 100644 libtiledbsoma/src/utils/nanoarrow.h create mode 100644 libtiledbsoma/src/utils/nanoarrow.hpp diff --git a/.github/workflows/r-ci.yml b/.github/workflows/r-ci.yml index a1cd3beb50..7d4b9b4878 100644 --- a/.github/workflows/r-ci.yml +++ b/.github/workflows/r-ci.yml @@ -79,14 +79,6 @@ jobs: # if: ${{ matrix.os != 'macOS-latest' }} # run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))" - - name: Install r-universe build of SeuratObject (macOS) - if: ${{ matrix.os == 'macOS-latest' }} - run: cd apis/r && Rscript -e "install.packages('SeuratObject', repos = c('https://mojaveazure.r-universe.dev', 'https://cloud.r-project.org'))" - - - name: Install r-universe build of SeuratObject (linux) - if: ${{ matrix.os == 'ubuntu-latest' }} - run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('SeuratObject', repos = c('https://mojaveazure.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))" - - name: Dependencies run: cd apis/r && tools/r-ci.sh install_all diff --git a/apis/python/src/tiledbsoma/reindexer.cc b/apis/python/src/tiledbsoma/reindexer.cc index 025325a73e..bbfa035658 100644 --- a/apis/python/src/tiledbsoma/reindexer.cc +++ b/apis/python/src/tiledbsoma/reindexer.cc @@ -31,7 +31,7 @@ */ #include -#include +// #include #include "common.h" #define DENUM(x) .value(#x, TILEDB_##x) diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index c1db979eb1..98a2167516 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -45,11 +45,13 @@ Imports: spdl, rlang, tools, - tibble + tibble, + nanoarrow LinkingTo: Rcpp, RcppSpdlog, - RcppInt64 + RcppInt64, + nanoarrow Additional_repositories: https://ghrr.github.io/drat Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 diff --git a/apis/r/NAMESPACE b/apis/r/NAMESPACE index 2607cb7deb..7faba888a5 100644 --- a/apis/r/NAMESPACE +++ b/apis/r/NAMESPACE @@ -76,6 +76,7 @@ export(tiledbsoma_stats_show) export(write_soma) import(R6) import(methods) +import(nanoarrow) import(utils) importFrom(Matrix,as.matrix) importFrom(Matrix,sparseMatrix) diff --git a/apis/r/R/RcppExports.R b/apis/r/R/RcppExports.R index 6612d43103..677969b4f6 100644 --- a/apis/r/R/RcppExports.R +++ b/apis/r/R/RcppExports.R @@ -92,6 +92,12 @@ sr_complete <- function(sr) { .Call(`_tiledbsoma_sr_complete`, sr) } +#' @noRd +#' @import nanoarrow +create_empty_arrow_table <- function() { + .Call(`_tiledbsoma_create_empty_arrow_table`) +} + sr_next <- function(sr) { .Call(`_tiledbsoma_sr_next`, sr) } diff --git a/apis/r/R/utils-arrow.R b/apis/r/R/utils-arrow.R index 00d365caa0..55832d226e 100644 --- a/apis/r/R/utils-arrow.R +++ b/apis/r/R/utils-arrow.R @@ -66,12 +66,14 @@ tiledb_type_from_arrow_type <- function(x, is_dim) { utf8 = "UTF8", string = "UTF8", large_utf8 = "UTF8", - # date32 = "date32", + # based on what TileDB supports + date32 = "DATETIME_DAY", # date64 = "date64", # time32 = "time32", # time64 = "time64", # null = "null", - # timestamp = "timestamp", + # based on what TileDB supports with a default msec res. + timestamp = "DATETIME_MS", # decimal128 = "decimal128", # decimal256 = "decimal256", # struct = "struct", @@ -240,11 +242,10 @@ arrow_schema_from_tiledb_schema <- function(x) { arrow::schema(c(dimfields, attfields)) } -#' Validate external pointer to ArrowArray +#' Validate external pointer to ArrowArray which is embedded in a nanoarrow S3 type #' @noRd check_arrow_pointers <- function(arrlst) { - stopifnot("First argument must be an external pointer to ArrowArray" = check_arrow_array_tag(arrlst[[1]]), - "Second argument must be an external pointer to ArrowSchema" = check_arrow_schema_tag(arrlst[[2]])) + stopifnot(inherits(arrlst, "nanoarrow_array")) } #' Validate compatibility of Arrow data types diff --git a/apis/r/R/utils-readerTransformers.R b/apis/r/R/utils-readerTransformers.R index 0256b71146..ea52122adb 100644 --- a/apis/r/R/utils-readerTransformers.R +++ b/apis/r/R/utils-readerTransformers.R @@ -2,14 +2,13 @@ #' #' @description Converts the results of a \link{soma_array_reader} or #' \link{sr_next} to an arrow::\link[arrow]{Table} -#' @param x A List object with two pointers to Arrow array data and schema +#' @param x A nanoarrow_array object which is itself a wrapper around the external pointer +#' to the Arrow array data; the schema external pointer is added to it as well #' @return arrow::\link[arrow]{Table} #' @noRd soma_array_to_arrow_table <- function(x) { check_arrow_pointers(x) - arrow::as_arrow_table( - arrow::RecordBatch$import_from_c(x$array_data, x$schema) - ) + arrow::as_arrow_table(x) } #' Transformer function: Arrow table to Matrix::sparseMatrix diff --git a/apis/r/inst/include/tiledbsoma_types.h b/apis/r/inst/include/tiledbsoma_types.h index af27a1feb8..7bf0c44412 100644 --- a/apis/r/inst/include/tiledbsoma_types.h +++ b/apis/r/inst/include/tiledbsoma_types.h @@ -15,7 +15,7 @@ #define TILEDB_NO_API_DEPRECATION_WARNINGS #endif -#include // for C interface to Arrow +#include // for C interface to Arrow #include // for QueryCondition etc #define ARROW_SCHEMA_AND_ARRAY_DEFINED 1 #include diff --git a/apis/r/src/RcppExports.cpp b/apis/r/src/RcppExports.cpp index 25652298d9..84cdc2ac19 100644 --- a/apis/r/src/RcppExports.cpp +++ b/apis/r/src/RcppExports.cpp @@ -12,7 +12,7 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // soma_array_reader -Rcpp::List soma_array_reader(const std::string& uri, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, const std::string& loglevel, Rcpp::Nullable config); +SEXP soma_array_reader(const std::string& uri, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, const std::string& loglevel, Rcpp::Nullable config); RcppExport SEXP _tiledbsoma_soma_array_reader(SEXP uriSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP loglevelSEXP, SEXP configSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; @@ -129,8 +129,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// create_empty_arrow_table +SEXP create_empty_arrow_table(); +RcppExport SEXP _tiledbsoma_create_empty_arrow_table() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(create_empty_arrow_table()); + return rcpp_result_gen; +END_RCPP +} // sr_next -Rcpp::List sr_next(Rcpp::XPtr sr); +SEXP sr_next(Rcpp::XPtr sr); RcppExport SEXP _tiledbsoma_sr_next(SEXP srSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; @@ -220,6 +230,7 @@ static const R_CallMethodDef CallEntries[] = { {"_tiledbsoma_shape", (DL_FUNC) &_tiledbsoma_shape, 2}, {"_tiledbsoma_sr_setup", (DL_FUNC) &_tiledbsoma_sr_setup, 10}, {"_tiledbsoma_sr_complete", (DL_FUNC) &_tiledbsoma_sr_complete, 1}, + {"_tiledbsoma_create_empty_arrow_table", (DL_FUNC) &_tiledbsoma_create_empty_arrow_table, 0}, {"_tiledbsoma_sr_next", (DL_FUNC) &_tiledbsoma_sr_next, 1}, {"_tiledbsoma_tiledbsoma_stats_enable", (DL_FUNC) &_tiledbsoma_tiledbsoma_stats_enable, 0}, {"_tiledbsoma_tiledbsoma_stats_disable", (DL_FUNC) &_tiledbsoma_tiledbsoma_stats_disable, 0}, diff --git a/apis/r/src/nanoarrow.h b/apis/r/src/nanoarrow.h deleted file mode 100644 index 90ce2dc06a..0000000000 --- a/apis/r/src/nanoarrow.h +++ /dev/null @@ -1,3097 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_BUILD_ID_H_INCLUDED -#define NANOARROW_BUILD_ID_H_INCLUDED - -#define NANOARROW_VERSION_MAJOR 0 -#define NANOARROW_VERSION_MINOR 2 -#define NANOARROW_VERSION_PATCH 0 -#define NANOARROW_VERSION "0.2.0-SNAPSHOT" - -#define NANOARROW_VERSION_INT \ - (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ - NANOARROW_VERSION_PATCH) - -// #define NANOARROW_NAMESPACE YourNamespaceHere - -#endif -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_NANOARROW_TYPES_H_INCLUDED -#define NANOARROW_NANOARROW_TYPES_H_INCLUDED - -#include -#include - - - -#ifdef __cplusplus -extern "C" { -#endif - -// Extra guard for versions of Arrow without the canonical guard -#ifndef ARROW_FLAG_DICTIONARY_ORDERED - -/// \defgroup nanoarrow-arrow-cdata Arrow C Data interface -/// -/// The Arrow C Data (https://arrow.apache.org/docs/format/CDataInterface.html) -/// and Arrow C Stream (https://arrow.apache.org/docs/format/CStreamInterface.html) -/// interfaces are part of the -/// Arrow Columnar Format specification -/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for -/// documentation of these structures. -/// -/// @{ - -#ifndef ARROW_C_DATA_INTERFACE -#define ARROW_C_DATA_INTERFACE - -#define ARROW_FLAG_DICTIONARY_ORDERED 1 -#define ARROW_FLAG_NULLABLE 2 -#define ARROW_FLAG_MAP_KEYS_SORTED 4 - -struct ArrowSchema { - // Array type description - const char* format; - const char* name; - const char* metadata; - int64_t flags; - int64_t n_children; - struct ArrowSchema** children; - struct ArrowSchema* dictionary; - - // Release callback - void (*release)(struct ArrowSchema*); - // Opaque producer-specific data - void* private_data; -}; - -struct ArrowArray { - // Array data description - int64_t length; - int64_t null_count; - int64_t offset; - int64_t n_buffers; - int64_t n_children; - const void** buffers; - struct ArrowArray** children; - struct ArrowArray* dictionary; - - // Release callback - void (*release)(struct ArrowArray*); - // Opaque producer-specific data - void* private_data; -}; - -#endif // ARROW_C_DATA_INTERFACE - -#ifndef ARROW_C_STREAM_INTERFACE -#define ARROW_C_STREAM_INTERFACE - -struct ArrowArrayStream { - // Callback to get the stream type - // (will be the same for all arrays in the stream). - // - // Return value: 0 if successful, an `errno`-compatible error code otherwise. - // - // If successful, the ArrowSchema must be released independently from the stream. - int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); - - // Callback to get the next array - // (if no error and the array is released, the stream has ended) - // - // Return value: 0 if successful, an `errno`-compatible error code otherwise. - // - // If successful, the ArrowArray must be released independently from the stream. - int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); - - // Callback to get optional detailed error information. - // This must only be called if the last stream operation failed - // with a non-0 return code. - // - // Return value: pointer to a null-terminated character array describing - // the last error, or NULL if no description is available. - // - // The returned pointer is only valid until the next operation on this stream - // (including release). - const char* (*get_last_error)(struct ArrowArrayStream*); - - // Release callback: release the stream's own resources. - // Note that arrays returned by `get_next` must be individually released. - void (*release)(struct ArrowArrayStream*); - - // Opaque producer-specific data - void* private_data; -}; - -#endif // ARROW_C_STREAM_INTERFACE -#endif // ARROW_FLAG_DICTIONARY_ORDERED - -/// \brief Move the contents of src into dst and set src->release to NULL -static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { - memcpy(dst, src, sizeof(struct ArrowSchema)); - src->release = NULL; -} - -/// \brief Move the contents of src into dst and set src->release to NULL -static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { - memcpy(dst, src, sizeof(struct ArrowArray)); - src->release = NULL; -} - -/// \brief Move the contents of src into dst and set src->release to NULL -static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, - struct ArrowArrayStream* dst) { - memcpy(dst, src, sizeof(struct ArrowArrayStream)); - src->release = NULL; -} - -/// @} - -// Utility macros -#define _NANOARROW_CONCAT(x, y) x##y -#define _NANOARROW_MAKE_NAME(x, y) _NANOARROW_CONCAT(x, y) - -#define _NANOARROW_RETURN_NOT_OK_IMPL(NAME, EXPR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) return NAME; \ - } while (0) - -#define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ - NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) - -/// \brief Return code for success. -/// \ingroup nanoarrow-errors -#define NANOARROW_OK 0 - -/// \brief Represents an errno-compatible error code -/// \ingroup nanoarrow-errors -typedef int ArrowErrorCode; - -/// \brief Check the result of an expression and return it if not NANOARROW_OK -/// \ingroup nanoarrow-errors -#define NANOARROW_RETURN_NOT_OK(EXPR) \ - _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) - -static char _ArrowIsLittleEndian(void) { - uint32_t check = 1; - char first_byte; - memcpy(&first_byte, &check, sizeof(char)); - return first_byte; -} - -/// \brief Arrow type enumerator -/// \ingroup nanoarrow-utils -/// -/// These names are intended to map to the corresponding arrow::Type::type -/// enumerator; however, the numeric values are specifically not equal -/// (i.e., do not rely on numeric comparison). -enum ArrowType { - NANOARROW_TYPE_UNINITIALIZED = 0, - NANOARROW_TYPE_NA = 1, - NANOARROW_TYPE_BOOL, - NANOARROW_TYPE_UINT8, - NANOARROW_TYPE_INT8, - NANOARROW_TYPE_UINT16, - NANOARROW_TYPE_INT16, - NANOARROW_TYPE_UINT32, - NANOARROW_TYPE_INT32, - NANOARROW_TYPE_UINT64, - NANOARROW_TYPE_INT64, - NANOARROW_TYPE_HALF_FLOAT, - NANOARROW_TYPE_FLOAT, - NANOARROW_TYPE_DOUBLE, - NANOARROW_TYPE_STRING, - NANOARROW_TYPE_BINARY, - NANOARROW_TYPE_FIXED_SIZE_BINARY, - NANOARROW_TYPE_DATE32, - NANOARROW_TYPE_DATE64, - NANOARROW_TYPE_TIMESTAMP, - NANOARROW_TYPE_TIME32, - NANOARROW_TYPE_TIME64, - NANOARROW_TYPE_INTERVAL_MONTHS, - NANOARROW_TYPE_INTERVAL_DAY_TIME, - NANOARROW_TYPE_DECIMAL128, - NANOARROW_TYPE_DECIMAL256, - NANOARROW_TYPE_LIST, - NANOARROW_TYPE_STRUCT, - NANOARROW_TYPE_SPARSE_UNION, - NANOARROW_TYPE_DENSE_UNION, - NANOARROW_TYPE_DICTIONARY, - NANOARROW_TYPE_MAP, - NANOARROW_TYPE_EXTENSION, - NANOARROW_TYPE_FIXED_SIZE_LIST, - NANOARROW_TYPE_DURATION, - NANOARROW_TYPE_LARGE_STRING, - NANOARROW_TYPE_LARGE_BINARY, - NANOARROW_TYPE_LARGE_LIST, - NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO -}; - -/// \brief Get a string value of an enum ArrowType value -/// \ingroup nanoarrow-utils -/// -/// Returns NULL for invalid values for type -static inline const char* ArrowTypeString(enum ArrowType type) { - switch (type) { - case NANOARROW_TYPE_NA: - return "na"; - case NANOARROW_TYPE_BOOL: - return "bool"; - case NANOARROW_TYPE_UINT8: - return "uint8"; - case NANOARROW_TYPE_INT8: - return "int8"; - case NANOARROW_TYPE_UINT16: - return "uint16"; - case NANOARROW_TYPE_INT16: - return "int16"; - case NANOARROW_TYPE_UINT32: - return "uint32"; - case NANOARROW_TYPE_INT32: - return "int32"; - case NANOARROW_TYPE_UINT64: - return "uint64"; - case NANOARROW_TYPE_INT64: - return "int64"; - case NANOARROW_TYPE_HALF_FLOAT: - return "half_float"; - case NANOARROW_TYPE_FLOAT: - return "float"; - case NANOARROW_TYPE_DOUBLE: - return "double"; - case NANOARROW_TYPE_STRING: - return "string"; - case NANOARROW_TYPE_BINARY: - return "binary"; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - return "fixed_size_binary"; - case NANOARROW_TYPE_DATE32: - return "date32"; - case NANOARROW_TYPE_DATE64: - return "date64"; - case NANOARROW_TYPE_TIMESTAMP: - return "timestamp"; - case NANOARROW_TYPE_TIME32: - return "time32"; - case NANOARROW_TYPE_TIME64: - return "time64"; - case NANOARROW_TYPE_INTERVAL_MONTHS: - return "interval_months"; - case NANOARROW_TYPE_INTERVAL_DAY_TIME: - return "interval_day_time"; - case NANOARROW_TYPE_DECIMAL128: - return "decimal128"; - case NANOARROW_TYPE_DECIMAL256: - return "decimal256"; - case NANOARROW_TYPE_LIST: - return "list"; - case NANOARROW_TYPE_STRUCT: - return "struct"; - case NANOARROW_TYPE_SPARSE_UNION: - return "sparse_union"; - case NANOARROW_TYPE_DENSE_UNION: - return "dense_union"; - case NANOARROW_TYPE_DICTIONARY: - return "dictionary"; - case NANOARROW_TYPE_MAP: - return "map"; - case NANOARROW_TYPE_EXTENSION: - return "extension"; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - return "fixed_size_list"; - case NANOARROW_TYPE_DURATION: - return "duration"; - case NANOARROW_TYPE_LARGE_STRING: - return "large_string"; - case NANOARROW_TYPE_LARGE_BINARY: - return "large_binary"; - case NANOARROW_TYPE_LARGE_LIST: - return "large_list"; - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - return "interval_month_day_nano"; - default: - return NULL; - } -} - -/// \brief Arrow time unit enumerator -/// \ingroup nanoarrow-utils -/// -/// These names and values map to the corresponding arrow::TimeUnit::type -/// enumerator. -enum ArrowTimeUnit { - NANOARROW_TIME_UNIT_SECOND = 0, - NANOARROW_TIME_UNIT_MILLI = 1, - NANOARROW_TIME_UNIT_MICRO = 2, - NANOARROW_TIME_UNIT_NANO = 3 -}; - -/// \brief Validation level enumerator -/// \ingroup nanoarrow-array -enum ArrowValidationLevel { - /// \brief Do not validate buffer sizes or content. - NANOARROW_VALIDATION_LEVEL_NONE = 0, - - /// \brief Validate buffer sizes that depend on array length but do not validate buffer - /// sizes that depend on buffer data access. - NANOARROW_VALIDATION_LEVEL_MINIMAL = 1, - - /// \brief Validate all buffer sizes, including those that require buffer data access, - /// but do not perform any checks that are O(1) along the length of the buffers. - NANOARROW_VALIDATION_LEVEL_DEFAULT = 2, - - /// \brief Validate all buffer sizes and all buffer content. This is useful in the - /// context of untrusted input or input that may have been corrupted in transit. - NANOARROW_VALIDATION_LEVEL_FULL = 3 -}; - -/// \brief Get a string value of an enum ArrowTimeUnit value -/// \ingroup nanoarrow-utils -/// -/// Returns NULL for invalid values for time_unit -static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { - switch (time_unit) { - case NANOARROW_TIME_UNIT_SECOND: - return "s"; - case NANOARROW_TIME_UNIT_MILLI: - return "ms"; - case NANOARROW_TIME_UNIT_MICRO: - return "us"; - case NANOARROW_TIME_UNIT_NANO: - return "ns"; - default: - return NULL; - } -} - -/// \brief Functional types of buffers as described in the Arrow Columnar Specification -/// \ingroup nanoarrow-array-view -enum ArrowBufferType { - NANOARROW_BUFFER_TYPE_NONE, - NANOARROW_BUFFER_TYPE_VALIDITY, - NANOARROW_BUFFER_TYPE_TYPE_ID, - NANOARROW_BUFFER_TYPE_UNION_OFFSET, - NANOARROW_BUFFER_TYPE_DATA_OFFSET, - NANOARROW_BUFFER_TYPE_DATA -}; - -/// \brief An non-owning view of a string -/// \ingroup nanoarrow-utils -struct ArrowStringView { - /// \brief A pointer to the start of the string - /// - /// If size_bytes is 0, this value may be NULL. - const char* data; - - /// \brief The size of the string in bytes, - /// - /// (Not including the null terminator.) - int64_t size_bytes; -}; - -/// \brief Return a view of a const C string -/// \ingroup nanoarrow-utils -static inline struct ArrowStringView ArrowCharView(const char* value) { - struct ArrowStringView out; - - out.data = value; - if (value) { - out.size_bytes = (int64_t)strlen(value); - } else { - out.size_bytes = 0; - } - - return out; -} - -/// \brief An non-owning view of a buffer -/// \ingroup nanoarrow-utils -struct ArrowBufferView { - /// \brief A pointer to the start of the buffer - /// - /// If size_bytes is 0, this value may be NULL. - union { - const void* data; - const int8_t* as_int8; - const uint8_t* as_uint8; - const int16_t* as_int16; - const uint16_t* as_uint16; - const int32_t* as_int32; - const uint32_t* as_uint32; - const int64_t* as_int64; - const uint64_t* as_uint64; - const double* as_double; - const float* as_float; - const char* as_char; - } data; - - /// \brief The size of the buffer in bytes - int64_t size_bytes; -}; - -/// \brief Array buffer allocation and deallocation -/// \ingroup nanoarrow-buffer -/// -/// Container for allocate, reallocate, and free methods that can be used -/// to customize allocation and deallocation of buffers when constructing -/// an ArrowArray. -struct ArrowBufferAllocator { - /// \brief Reallocate a buffer or return NULL if it cannot be reallocated - uint8_t* (*reallocate)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, - int64_t old_size, int64_t new_size); - - /// \brief Deallocate a buffer allocated by this allocator - void (*free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); - - /// \brief Opaque data specific to the allocator - void* private_data; -}; - -/// \brief An owning mutable view of a buffer -/// \ingroup nanoarrow-buffer -struct ArrowBuffer { - /// \brief A pointer to the start of the buffer - /// - /// If capacity_bytes is 0, this value may be NULL. - uint8_t* data; - - /// \brief The size of the buffer in bytes - int64_t size_bytes; - - /// \brief The capacity of the buffer in bytes - int64_t capacity_bytes; - - /// \brief The allocator that will be used to reallocate and/or free the buffer - struct ArrowBufferAllocator allocator; -}; - -/// \brief An owning mutable view of a bitmap -/// \ingroup nanoarrow-bitmap -struct ArrowBitmap { - /// \brief An ArrowBuffer to hold the allocated memory - struct ArrowBuffer buffer; - - /// \brief The number of bits that have been appended to the bitmap - int64_t size_bits; -}; - -/// \brief A description of an arrangement of buffers -/// \ingroup nanoarrow-utils -/// -/// Contains the minimum amount of information required to -/// calculate the size of each buffer in an ArrowArray knowing only -/// the length and offset of the array. -struct ArrowLayout { - /// \brief The function of each buffer - enum ArrowBufferType buffer_type[3]; - - /// \brief The size of an element each buffer or 0 if this size is variable or unknown - int64_t element_size_bits[3]; - - /// \brief The number of elements in the child array per element in this array for a - /// fixed-size list - int64_t child_size_elements; -}; - -/// \brief A non-owning view of an ArrowArray -/// \ingroup nanoarrow-array-view -/// -/// This data structure provides access to the values contained within -/// an ArrowArray with fields provided in a more readily-extractible -/// form. You can re-use an ArrowArrayView for multiple ArrowArrays -/// with the same storage type, or use it to represent a hypothetical -/// ArrowArray that does not exist yet. -struct ArrowArrayView { - /// \brief The underlying ArrowArray or NULL if it has not been set - struct ArrowArray* array; - - /// \brief The type used to store values in this array - /// - /// This type represents only the minimum required information to - /// extract values from the array buffers (e.g., for a Date32 array, - /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded - /// arrays, this will be the index type. - enum ArrowType storage_type; - - /// \brief The buffer types, strides, and sizes of this Array's buffers - struct ArrowLayout layout; - - /// \brief This Array's buffers as ArrowBufferView objects - struct ArrowBufferView buffer_views[3]; - - /// \brief The number of children of this view - int64_t n_children; - - /// \brief Pointers to views of this array's children - struct ArrowArrayView** children; - - /// \brief Union type id to child index mapping - /// - /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer - /// such that child_index == union_type_id_map[type_id] and - /// type_id == union_type_id_map[128 + child_index]. This value may be - /// NULL in the case where child_id == type_id. - int8_t* union_type_id_map; -}; - -// Used as the private data member for ArrowArrays allocated here and accessed -// internally within inline ArrowArray* helpers. -struct ArrowArrayPrivateData { - // Holder for the validity buffer (or first buffer for union types, which are - // the only type whose first buffer is not a valdiity buffer) - struct ArrowBitmap bitmap; - - // Holder for additional buffers as required - struct ArrowBuffer buffers[2]; - - // The array of pointers to buffers. This must be updated after a sequence - // of appends to synchronize its values with the actual buffer addresses - // (which may have ben reallocated uring that time) - const void* buffer_data[3]; - - // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown - enum ArrowType storage_type; - - // The buffer arrangement for the storage type - struct ArrowLayout layout; - - // Flag to indicate if there are non-sequence union type ids. - // In the future this could be replaced with a type id<->child mapping - // to support constructing unions in append mode where type_id != child_index - int8_t union_type_id_is_child_index; -}; - -/// \brief A representation of a fixed-precision decimal number -/// \ingroup nanoarrow-utils -/// -/// This structure should be initialized with ArrowDecimalInit() once and -/// values set using ArrowDecimalSetInt(), ArrowDecimalSetBytes128(), -/// or ArrowDecimalSetBytes256(). -struct ArrowDecimal { - /// \brief An array of 64-bit integers of n_words length defined in native-endian order - uint64_t words[4]; - - /// \brief The number of significant digits this decimal number can represent - int32_t precision; - - /// \brief The number of digits after the decimal point. This can be negative. - int32_t scale; - - /// \brief The number of words in the words array - int n_words; - - /// \brief Cached value used by the implementation - int high_word_index; - - /// \brief Cached value used by the implementation - int low_word_index; -}; - -/// \brief Initialize a decimal with a given set of type parameters -/// \ingroup nanoarrow-utils -static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwidth, - int32_t precision, int32_t scale) { - memset(decimal->words, 0, sizeof(decimal->words)); - decimal->precision = precision; - decimal->scale = scale; - decimal->n_words = bitwidth / 8 / sizeof(uint64_t); - - if (_ArrowIsLittleEndian()) { - decimal->low_word_index = 0; - decimal->high_word_index = decimal->n_words - 1; - } else { - decimal->low_word_index = decimal->n_words - 1; - decimal->high_word_index = 0; - } -} - -/// \brief Get a signed integer value of a sufficiently small ArrowDecimal -/// -/// This does not check if the decimal's precision sufficiently small to fit -/// within the signed 64-bit integer range (A precision less than or equal -/// to 18 is sufficiently small). -static inline int64_t ArrowDecimalGetIntUnsafe(struct ArrowDecimal* decimal) { - return (int64_t)decimal->words[decimal->low_word_index]; -} - -/// \brief Copy the bytes of this decimal into a sufficiently large buffer -/// \ingroup nanoarrow-utils -static inline void ArrowDecimalGetBytes(struct ArrowDecimal* decimal, uint8_t* out) { - memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); -} - -/// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise -/// \ingroup nanoarrow-utils -static inline int64_t ArrowDecimalSign(struct ArrowDecimal* decimal) { - return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); -} - -/// \brief Sets the integer value of this decimal -/// \ingroup nanoarrow-utils -static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t value) { - if (value < 0) { - memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); - } else { - memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); - } - - decimal->words[decimal->low_word_index] = value; -} - -/// \brief Copy bytes from a buffer into this decimal -/// \ingroup nanoarrow-utils -static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, - const uint8_t* value) { - memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); -} - -#ifdef __cplusplus -} -#endif - -#endif -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_H_INCLUDED -#define NANOARROW_H_INCLUDED - -#include -#include -#include - - - -// If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will set this -// define in nanoarrow_config.h. If not, you can optionally #define NANOARROW_NAMESPACE -// MyNamespace here. - -// This section remaps the non-prefixed symbols to the prefixed symbols so that -// code written against this build can be used independent of the value of -// NANOARROW_NAMESPACE. -#ifdef NANOARROW_NAMESPACE -#define NANOARROW_CAT(A, B) A##B -#define NANOARROW_SYMBOL(A, B) NANOARROW_CAT(A, B) - -#define ArrowNanoarrowVersion NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) -#define ArrowNanoarrowVersionInt \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) -#define ArrowErrorMessage NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorMessage) -#define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc) -#define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc) -#define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree) -#define ArrowBufferAllocatorDefault \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault) -#define ArrowBufferDeallocator \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) -#define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet) -#define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit) -#define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit) -#define ArrowSchemaInitFromType \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) -#define ArrowSchemaSetType NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType) -#define ArrowSchemaSetTypeStruct \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct) -#define ArrowSchemaSetTypeFixedSize \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) -#define ArrowSchemaSetTypeDecimal \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) -#define ArrowSchemaSetTypeDateTime \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) -#define ArrowSchemaSetTypeUnion \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion) -#define ArrowSchemaDeepCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy) -#define ArrowSchemaSetFormat NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat) -#define ArrowSchemaSetName NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName) -#define ArrowSchemaSetMetadata \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata) -#define ArrowSchemaAllocateChildren \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren) -#define ArrowSchemaAllocateDictionary \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary) -#define ArrowMetadataReaderInit \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit) -#define ArrowMetadataReaderRead \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead) -#define ArrowMetadataSizeOf NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf) -#define ArrowMetadataHasKey NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey) -#define ArrowMetadataGetValue NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue) -#define ArrowMetadataBuilderInit \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit) -#define ArrowMetadataBuilderAppend \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend) -#define ArrowMetadataBuilderSet \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet) -#define ArrowMetadataBuilderRemove \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove) -#define ArrowSchemaViewInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit) -#define ArrowSchemaToString NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString) -#define ArrowArrayInitFromType \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) -#define ArrowArrayInitFromSchema \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) -#define ArrowArrayAllocateChildren \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) -#define ArrowArrayAllocateDictionary \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary) -#define ArrowArraySetValidityBitmap \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap) -#define ArrowArraySetBuffer NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer) -#define ArrowArrayReserve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve) -#define ArrowArrayFinishBuilding \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) -#define ArrowArrayFinishBuildingDefault \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) -#define ArrowArrayViewInitFromType \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) -#define ArrowArrayViewInitFromSchema \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) -#define ArrowArrayViewAllocateChildren \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) -#define ArrowArrayViewSetLength \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength) -#define ArrowArrayViewSetArray \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) -#define ArrowArrayViewValidateFull \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidateFull) -#define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) -#define ArrowBasicArrayStreamInit \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) -#define ArrowBasicArrayStreamSetArray \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray) -#define ArrowBasicArrayStreamValidate \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate) - -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/// \defgroup nanoarrow Nanoarrow C library -/// -/// Except where noted, objects are not thread-safe and clients should -/// take care to serialize accesses to methods. -/// -/// Because this library is intended to be vendored, it provides full type -/// definitions and encourages clients to stack or statically allocate -/// where convenient. - -/// \defgroup nanoarrow-malloc Memory management -/// -/// Non-buffer members of a struct ArrowSchema and struct ArrowArray -/// must be allocated using ArrowMalloc() or ArrowRealloc() and freed -/// using ArrowFree() for schemas and arrays allocated here. Buffer members -/// are allocated using an ArrowBufferAllocator. -/// -/// @{ - -/// \brief Allocate like malloc() -void* ArrowMalloc(int64_t size); - -/// \brief Reallocate like realloc() -void* ArrowRealloc(void* ptr, int64_t size); - -/// \brief Free a pointer allocated using ArrowMalloc() or ArrowRealloc(). -void ArrowFree(void* ptr); - -/// \brief Return the default allocator -/// -/// The default allocator uses ArrowMalloc(), ArrowRealloc(), and -/// ArrowFree(). -struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void); - -/// \brief Create a custom deallocator -/// -/// Creates a buffer allocator with only a free method that can be used to -/// attach a custom deallocator to an ArrowBuffer. This may be used to -/// avoid copying an existing buffer that was not allocated using the -/// infrastructure provided here (e.g., by an R or Python object). -struct ArrowBufferAllocator ArrowBufferDeallocator( - void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, - int64_t size), - void* private_data); - -/// @} - -/// \defgroup nanoarrow-errors Error handling -/// -/// Functions generally return an errno-compatible error code; functions that -/// need to communicate more verbose error information accept a pointer -/// to an ArrowError. This can be stack or statically allocated. The -/// content of the message is undefined unless an error code has been -/// returned. -/// -/// @{ - -/// \brief Error type containing a UTF-8 encoded message. -struct ArrowError { - /// \brief A character buffer with space for an error message. - char message[1024]; -}; - -/// \brief Set the contents of an error using printf syntax -ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...); - -/// \brief Get the contents of an error -const char* ArrowErrorMessage(struct ArrowError* error); - -/// @} - -/// \defgroup nanoarrow-utils Utility data structures -/// -/// @{ - -/// \brief Return a version string in the form "major.minor.patch" -const char* ArrowNanoarrowVersion(void); - -/// \brief Return an integer that can be used to compare versions sequentially -int ArrowNanoarrowVersionInt(void); - -/// \brief Initialize a description of buffer arrangements from a storage type -void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type); - -/// \brief Create a string view from a null-terminated string -static inline struct ArrowStringView ArrowCharView(const char* value); - -/// @} - -/// \defgroup nanoarrow-schema Creating schemas -/// -/// These functions allocate, copy, and destroy ArrowSchema structures -/// -/// @{ - -/// \brief Initialize an ArrowSchema -/// -/// Initializes the fields and release callback of schema_out. Caller -/// is responsible for calling the schema->release callback if -/// NANOARROW_OK is returned. -void ArrowSchemaInit(struct ArrowSchema* schema); - -/// \brief Initialize an ArrowSchema from an ArrowType -/// -/// A convenience constructor for that calls ArrowSchemaInit() and -/// ArrowSchemaSetType() for the common case of constructing an -/// unparameterized type. The caller is responsible for calling the schema->release -/// callback if NANOARROW_OK is returned. -ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type); - -/// \brief Get a human-readable summary of a Schema -/// -/// Writes a summary of an ArrowSchema to out (up to n - 1 characters) -/// and returns the number of characters required for the output if -/// n were sufficiently large. If recursive is non-zero, the result will -/// also include children. -int64_t ArrowSchemaToString(struct ArrowSchema* schema, char* out, int64_t n, - char recursive); - -/// \brief Set the format field of a schema from an ArrowType -/// -/// Initializes the fields and release callback of schema_out. For -/// NANOARROW_TYPE_LIST, NANOARROW_TYPE_LARGE_LIST, and -/// NANOARROW_TYPE_MAP, the appropriate number of children are -/// allocated, initialized, and named; however, the caller must -/// ArrowSchemaSetType() on the preinitialized children. Schema must have been initialized -/// using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type); - -/// \brief Set the format field and initialize children of a struct schema -/// -/// The specified number of children are initialized; however, the caller is responsible -/// for calling ArrowSchemaSetType() and ArrowSchemaSetName() on each child. -/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children); - -/// \brief Set the format field of a fixed-size schema -/// -/// Returns EINVAL for fixed_size <= 0 or for type that is not -/// NANOARROW_TYPE_FIXED_SIZE_BINARY or NANOARROW_TYPE_FIXED_SIZE_LIST. -/// For NANOARROW_TYPE_FIXED_SIZE_LIST, the appropriate number of children are -/// allocated, initialized, and named; however, the caller must -/// ArrowSchemaSetType() the first child. Schema must have been initialized using -/// ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, - enum ArrowType type, int32_t fixed_size); - -/// \brief Set the format field of a decimal schema -/// -/// Returns EINVAL for scale <= 0 or for type that is not -/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have been -/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, - int32_t decimal_precision, - int32_t decimal_scale); - -/// \brief Set the format field of a time, timestamp, or duration schema -/// -/// Returns EINVAL for type that is not -/// NANOARROW_TYPE_TIME32, NANOARROW_TYPE_TIME64, -/// NANOARROW_TYPE_TIMESTAMP, or NANOARROW_TYPE_DURATION. The -/// timezone parameter must be NULL for a non-timestamp type. Schema must have been -/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, - enum ArrowTimeUnit time_unit, - const char* timezone); - -/// \brief Seet the format field of a union schema -/// -/// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION -/// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are -/// allocated, and initialized. -ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, - int64_t n_children); - -/// \brief Make a (recursive) copy of a schema -/// -/// Allocates and copies fields of schema into schema_out. -ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, - struct ArrowSchema* schema_out); - -/// \brief Copy format into schema->format -/// -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format); - -/// \brief Copy name into schema->name -/// -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name); - -/// \brief Copy metadata into schema->metadata -/// -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy. -ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata); - -/// \brief Allocate the schema->children array -/// -/// Includes the memory for each child struct ArrowSchema. -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, - int64_t n_children); - -/// \brief Allocate the schema->dictionary member -/// -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema); - -/// @} - -/// \defgroup nanoarrow-metadata Create, read, and modify schema metadata -/// -/// @{ - -/// \brief Reader for key/value pairs in schema metadata -/// -/// The ArrowMetadataReader does not own any data and is only valid -/// for the lifetime of the underlying metadata pointer. -struct ArrowMetadataReader { - /// \brief A metadata string from a schema->metadata field. - const char* metadata; - - /// \brief The current offset into the metadata string - int64_t offset; - - /// \brief The number of remaining keys - int32_t remaining_keys; -}; - -/// \brief Initialize an ArrowMetadataReader -ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, - const char* metadata); - -/// \brief Read the next key/value pair from an ArrowMetadataReader -ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, - struct ArrowStringView* key_out, - struct ArrowStringView* value_out); - -/// \brief The number of bytes in in a key/value metadata string -int64_t ArrowMetadataSizeOf(const char* metadata); - -/// \brief Check for a key in schema metadata -char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key); - -/// \brief Extract a value from schema metadata -/// -/// If key does not exist in metadata, value_out is unmodified -ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, - struct ArrowStringView* value_out); - -/// \brief Initialize a builder for schema metadata from key/value pairs -/// -/// metadata can be an existing metadata string or NULL to initialize -/// an empty metadata string. -ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata); - -/// \brief Append a key/value pair to a buffer containing serialized metadata -ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, - struct ArrowStringView key, - struct ArrowStringView value); - -/// \brief Set a key/value pair to a buffer containing serialized metadata -/// -/// Ensures that the only entry for key in the metadata is set to value. -/// This function maintains the existing position of (the first instance of) -/// key if present in the data. -ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, - struct ArrowStringView key, - struct ArrowStringView value); - -/// \brief Remove a key from a buffer containing serialized metadata -ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, - struct ArrowStringView key); - -/// @} - -/// \defgroup nanoarrow-schema-view Reading schemas -/// -/// @{ - -/// \brief A non-owning view of a parsed ArrowSchema -/// -/// Contains more readily extractable values than a raw ArrowSchema. -/// Clients can stack or statically allocate this structure but are -/// encouraged to use the provided getters to ensure forward -/// compatiblity. -struct ArrowSchemaView { - /// \brief A pointer to the schema represented by this view - struct ArrowSchema* schema; - - /// \brief The data type represented by the schema - /// - /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a - /// non-null dictionary member; datetime types are valid values. - /// This value will never be NANOARROW_TYPE_EXTENSION (see - /// extension_name and/or extension_metadata to check for - /// an extension type). - enum ArrowType type; - - /// \brief The storage data type represented by the schema - /// - /// This value will never be NANOARROW_TYPE_DICTIONARY, NANOARROW_TYPE_EXTENSION - /// or any datetime type. This value represents only the type required to - /// interpret the buffers in the array. - enum ArrowType storage_type; - - /// \brief The storage layout represented by the schema - struct ArrowLayout layout; - - /// \brief The extension type name if it exists - /// - /// If the ARROW:extension:name key is present in schema.metadata, - /// extension_name.data will be non-NULL. - struct ArrowStringView extension_name; - - /// \brief The extension type metadata if it exists - /// - /// If the ARROW:extension:metadata key is present in schema.metadata, - /// extension_metadata.data will be non-NULL. - struct ArrowStringView extension_metadata; - - /// \brief Format fixed size parameter - /// - /// This value is set when parsing a fixed-size binary or fixed-size - /// list schema; this value is undefined for other types. For a - /// fixed-size binary schema this value is in bytes; for a fixed-size - /// list schema this value refers to the number of child elements for - /// each element of the parent. - int32_t fixed_size; - - /// \brief Decimal bitwidth - /// - /// This value is set when parsing a decimal type schema; - /// this value is undefined for other types. - int32_t decimal_bitwidth; - - /// \brief Decimal precision - /// - /// This value is set when parsing a decimal type schema; - /// this value is undefined for other types. - int32_t decimal_precision; - - /// \brief Decimal scale - /// - /// This value is set when parsing a decimal type schema; - /// this value is undefined for other types. - int32_t decimal_scale; - - /// \brief Format time unit parameter - /// - /// This value is set when parsing a date/time type. The value is - /// undefined for other types. - enum ArrowTimeUnit time_unit; - - /// \brief Format timezone parameter - /// - /// This value is set when parsing a timestamp type and represents - /// the timezone format parameter. This value points to - /// data within the schema and is undefined for other types. - const char* timezone; - - /// \brief Union type ids parameter - /// - /// This value is set when parsing a union type and represents - /// type ids parameter. This value points to - /// data within the schema and is undefined for other types. - const char* union_type_ids; -}; - -/// \brief Initialize an ArrowSchemaView -ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, - struct ArrowSchema* schema, struct ArrowError* error); - -/// @} - -/// \defgroup nanoarrow-buffer Owning, growable buffers -/// -/// @{ - -/// \brief Initialize an ArrowBuffer -/// -/// Initialize a buffer with a NULL, zero-size buffer using the default -/// buffer allocator. -static inline void ArrowBufferInit(struct ArrowBuffer* buffer); - -/// \brief Set a newly-initialized buffer's allocator -/// -/// Returns EINVAL if the buffer has already been allocated. -static inline ArrowErrorCode ArrowBufferSetAllocator( - struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator); - -/// \brief Reset an ArrowBuffer -/// -/// Releases the buffer using the allocator's free method if -/// the buffer's data member is non-null, sets the data member -/// to NULL, and sets the buffer's size and capacity to 0. -static inline void ArrowBufferReset(struct ArrowBuffer* buffer); - -/// \brief Move an ArrowBuffer -/// -/// Transfers the buffer data and lifecycle management to another -/// address and resets buffer. -static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst); - -/// \brief Grow or shrink a buffer to a given capacity -/// -/// When shrinking the capacity of the buffer, the buffer is only reallocated -/// if shrink_to_fit is non-zero. Calling ArrowBufferResize() does not -/// adjust the buffer's size member except to ensure that the invariant -/// capacity >= size remains true. -static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, - int64_t new_capacity_bytes, - char shrink_to_fit); - -/// \brief Ensure a buffer has at least a given additional capacity -/// -/// Ensures that the buffer has space to append at least -/// additional_size_bytes, overallocating when required. -static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, - int64_t additional_size_bytes); - -/// \brief Write data to buffer and increment the buffer size -/// -/// This function does not check that buffer has the required capacity -static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, - int64_t size_bytes); - -/// \brief Write data to buffer and increment the buffer size -/// -/// This function writes and ensures that the buffer has the required capacity, -/// possibly by reallocating the buffer. Like ArrowBufferReserve, this will -/// overallocate when reallocation is required. -static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, - const void* data, int64_t size_bytes); - -/// \brief Write fill to buffer and increment the buffer size -/// -/// This function writes the specified number of fill bytes and -/// ensures that the buffer has the required capacity, -static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, - uint8_t value, int64_t size_bytes); - -/// \brief Write an 8-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, - int8_t value); - -/// \brief Write an unsigned 8-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, - uint8_t value); - -/// \brief Write a 16-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, - int16_t value); - -/// \brief Write an unsigned 16-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, - uint16_t value); - -/// \brief Write a 32-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, - int32_t value); - -/// \brief Write an unsigned 32-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, - uint32_t value); - -/// \brief Write a 64-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, - int64_t value); - -/// \brief Write an unsigned 64-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, - uint64_t value); - -/// \brief Write a double to a buffer -static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, - double value); - -/// \brief Write a float to a buffer -static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, - float value); - -/// \brief Write an ArrowStringView to a buffer -static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, - struct ArrowStringView value); - -/// \brief Write an ArrowBufferView to a buffer -static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, - struct ArrowBufferView value); - -/// @} - -/// \defgroup nanoarrow-bitmap Bitmap utilities -/// -/// @{ - -/// \brief Extract a boolean value from a bitmap -static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i); - -/// \brief Set a boolean value to a bitmap to true -static inline void ArrowBitSet(uint8_t* bits, int64_t i); - -/// \brief Set a boolean value to a bitmap to false -static inline void ArrowBitClear(uint8_t* bits, int64_t i); - -/// \brief Set a boolean value to a bitmap -static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t value); - -/// \brief Set a boolean value to a range in a bitmap -static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, - uint8_t bits_are_set); - -/// \brief Count true values in a bitmap -static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); - -/// \brief Initialize an ArrowBitmap -/// -/// Initialize the builder's buffer, empty its cache, and reset the size to zero -static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap); - -/// \brief Move an ArrowBitmap -/// -/// Transfers the underlying buffer data and lifecycle management to another -/// address and resets the bitmap. -static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst); - -/// \brief Ensure a bitmap builder has at least a given additional capacity -/// -/// Ensures that the buffer has space to append at least -/// additional_size_bits, overallocating when required. -static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, - int64_t additional_size_bits); - -/// \brief Grow or shrink a bitmap to a given capacity -/// -/// When shrinking the capacity of the bitmap, the bitmap is only reallocated -/// if shrink_to_fit is non-zero. Calling ArrowBitmapResize() does not -/// adjust the buffer's size member except when shrinking new_capacity_bits -/// to a value less than the current number of bits in the bitmap. -static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, - int64_t new_capacity_bits, - char shrink_to_fit); - -/// \brief Reserve space for and append zero or more of the same boolean value to a bitmap -static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length); - -/// \brief Append zero or more of the same boolean value to a bitmap -static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length); - -/// \brief Append boolean values encoded as int8_t to a bitmap -/// -/// The values must all be 0 or 1. -static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - const int8_t* values, int64_t n_values); - -/// \brief Append boolean values encoded as int32_t to a bitmap -/// -/// The values must all be 0 or 1. -static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, - const int32_t* values, int64_t n_values); - -/// \brief Reset a bitmap builder -/// -/// Releases any memory held by buffer, empties the cache, and resets the size to zero -static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap); - -/// @} - -/// \defgroup nanoarrow-array Creating arrays -/// -/// These functions allocate, copy, and destroy ArrowArray structures. -/// Once an ArrowArray has been initialized via ArrowArrayInitFromType() -/// or ArrowArrayInitFromSchema(), the caller is responsible for releasing -/// it using the embedded release callback. -/// -/// @{ - -/// \brief Initialize the fields of an array -/// -/// Initializes the fields and release callback of array. Caller -/// is responsible for calling the array->release callback if -/// NANOARROW_OK is returned. -ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, - enum ArrowType storage_type); - -/// \brief Initialize the contents of an ArrowArray from an ArrowSchema -/// -/// Caller is responsible for calling the array->release callback if -/// NANOARROW_OK is returned. -ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, - struct ArrowSchema* schema, - struct ArrowError* error); - -/// \brief Allocate the array->children array -/// -/// Includes the memory for each child struct ArrowArray, -/// whose members are marked as released and may be subsequently initialized -/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. -/// schema must have been allocated using ArrowArrayInitFromType(). -ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children); - -/// \brief Allocate the array->dictionary member -/// -/// Includes the memory for the struct ArrowArray, whose contents -/// is marked as released and may be subsequently initialized -/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. -/// array must have been allocated using ArrowArrayInitFromType() -ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array); - -/// \brief Set the validity bitmap of an ArrowArray -/// -/// array must have been allocated using ArrowArrayInitFromType() -void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap); - -/// \brief Set a buffer of an ArrowArray -/// -/// array must have been allocated using ArrowArrayInitFromType() -ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, - struct ArrowBuffer* buffer); - -/// \brief Get the validity bitmap of an ArrowArray -/// -/// array must have been allocated using ArrowArrayInitFromType() -static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array); - -/// \brief Get a buffer of an ArrowArray -/// -/// array must have been allocated using ArrowArrayInitFromType() -static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i); - -/// \brief Start element-wise appending to an ArrowArray -/// -/// Initializes any values needed to use ArrowArrayAppend*() functions. -/// All element-wise appenders append by value and return EINVAL if the exact value -/// cannot be represented by the underlying storage type. -/// array must have been allocated using ArrowArrayInitFromType() -static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array); - -/// \brief Reserve space for future appends -/// -/// For buffer sizes that can be calculated (i.e., not string data buffers or -/// child array sizes for non-fixed-size arrays), recursively reserve space for -/// additional elements. This is useful for reducing the number of reallocations -/// that occur using the item-wise appenders. -ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, - int64_t additional_size_elements); - -/// \brief Append a null value to an array -static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n); - -/// \brief Append an empty, non-null value to an array -static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n); - -/// \brief Append a signed integer value to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., value -/// is outside the valid array range). -static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, int64_t value); - -/// \brief Append an unsigned integer value to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., value -/// is outside the valid array range). -static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, - uint64_t value); - -/// \brief Append a double value to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., value -/// is outside the valid array range or there is an attempt to append -/// a non-integer to an array with an integer storage type). -static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, - double value); - -/// \brief Append a string of bytes to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., -/// the underlying array is not a binary, string, large binary, large string, -/// or fixed-size binary array, or value is the wrong size for a fixed-size -/// binary array). -static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, - struct ArrowBufferView value); - -/// \brief Append a string value to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., -/// the underlying array is not a string or large string array). -static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, - struct ArrowStringView value); - -/// \brief Append a decimal value to an array -/// -/// Returns NANOARROW_OK if array is a decimal array with the appropriate -/// bitwidth or EINVAL otherwise. -static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, - struct ArrowDecimal* value); - -/// \brief Finish a nested array element -/// -/// Appends a non-null element to the array based on the first child's current -/// length. Returns NANOARROW_OK if the item was successfully added or EINVAL -/// if the underlying storage type is not a struct, list, large list, or fixed-size -/// list, or if there was an attempt to add a struct or fixed-size list element where the -/// length of the child array(s) did not match the expected length. -static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array); - -/// \brief Finish a union array element -/// -/// Appends an element to the union type ids buffer and increments array->length. -/// For sparse unions, up to one element is added to non type-id children. Returns -/// EINVAL if the underlying storage type is not a union, if type_id is not valid, -/// or if child sizes after appending are inconsistent. -static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, - int8_t type_id); - -/// \brief Shrink buffer capacity to the size required -/// -/// Also applies shrinking to any child arrays. array must have been allocated using -/// ArrowArrayInitFromType -static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array); - -/// \brief Finish building an ArrowArray -/// -/// Flushes any pointers from internal buffers that may have been reallocated -/// into array->buffers and checks the actual size of the buffers -/// against the expected size based on the final length. -/// array must have been allocated using ArrowArrayInitFromType() -ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, - struct ArrowError* error); - -/// \brief Finish building an ArrowArray with explicit validation -/// -/// Finish building with an explicit validation level. This could perform less validation -/// (i.e. NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU -/// buffer data access is not possible or more validation (i.e., -/// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or -/// corruptable source. -ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, - enum ArrowValidationLevel validation_level, - struct ArrowError* error); - -/// @} - -/// \defgroup nanoarrow-array-view Reading arrays -/// -/// These functions read and validate the contents ArrowArray structures -/// -/// @{ - -/// \brief Initialize the contents of an ArrowArrayView -void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, - enum ArrowType storage_type); - -/// \brief Move an ArrowArrayView -/// -/// Transfers the ArrowArrayView data and lifecycle management to another -/// address and resets the contents of src. -static inline void ArrowArrayViewMove(struct ArrowArrayView* src, - struct ArrowArrayView* dst); - -/// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema -ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, - struct ArrowSchema* schema, - struct ArrowError* error); - -/// \brief Allocate the schema_view->children array -/// -/// Includes the memory for each child struct ArrowArrayView -ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, - int64_t n_children); - -/// \brief Set data-independent buffer sizes from length -void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); - -/// \brief Set buffer sizes and data pointers from an ArrowArray -ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, - struct ArrowArray* array, struct ArrowError* error); - -/// \brief Performs extra checks on the array that was set via ArrowArrayViewSetArray() -ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, - struct ArrowError* error); - -/// \brief Reset the contents of an ArrowArrayView and frees resources -void ArrowArrayViewReset(struct ArrowArrayView* array_view); - -/// \brief Check for a null element in an ArrowArrayView -static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i); - -/// \brief Get the type id of a union array element -static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view, - int64_t i); - -/// \brief Get the child index of a union array element -static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view, - int64_t i); - -/// \brief Get the index to use into the relevant union child array -static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view, - int64_t i); - -/// \brief Get an element in an ArrowArrayView as an integer -/// -/// This function does not check for null values, that values are actually integers, or -/// that values are within a valid range for an int64. -static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view, - int64_t i); - -/// \brief Get an element in an ArrowArrayView as an unsigned integer -/// -/// This function does not check for null values, that values are actually integers, or -/// that values are within a valid range for a uint64. -static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_view, - int64_t i); - -/// \brief Get an element in an ArrowArrayView as a double -/// -/// This function does not check for null values, or -/// that values are within a valid range for a double. -static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_view, - int64_t i); - -/// \brief Get an element in an ArrowArrayView as an ArrowStringView -/// -/// This function does not check for null values. -static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( - struct ArrowArrayView* array_view, int64_t i); - -/// \brief Get an element in an ArrowArrayView as an ArrowBufferView -/// -/// This function does not check for null values. -static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( - struct ArrowArrayView* array_view, int64_t i); - -/// \brief Get an element in an ArrowArrayView as an ArrowDecimal -/// -/// This function does not check for null values. The out parameter must -/// be initialized with ArrowDecimalInit() with the proper parameters for this -/// type before calling this for the first time. -static inline void ArrowArrayViewGetDecimalUnsafe(struct ArrowArrayView* array_view, - int64_t i, struct ArrowDecimal* out); - -/// @} - -/// \defgroup nanoarrow-basic-array-stream Basic ArrowArrayStream implementation -/// -/// An implementation of an ArrowArrayStream based on a collection of -/// zero or more previously-existing ArrowArray objects. Users should -/// initialize and/or validate the contents before transferring the -/// responsibility of the ArrowArrayStream elsewhere. -/// -/// @{ - -/// \brief Initialize an ArrowArrayStream backed by this implementation -/// -/// This function moves the ownership of schema to the array_stream. If -/// this function returns NANOARROW_OK, the caller is responsible for -/// releasing the ArrowArrayStream. -ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, - struct ArrowSchema* schema, int64_t n_arrays); - -/// \brief Set the ith ArrowArray in this ArrowArrayStream. -/// -/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). -/// This function move the ownership of array to the array_stream. i must -/// be greater than zero and less than the value of n_arrays passed in -/// ArrowBasicArrayStreamInit(). Callers are not required to fill all -/// n_arrays members (i.e., n_arrays is a maximum bound). -void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, - struct ArrowArray* array); - -/// \brief Validate the contents of this ArrowArrayStream -/// -/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). -/// This function uses ArrowArrayStreamInitFromSchema() and ArrowArrayStreamSetArray() -/// to validate the contents of the arrays. -ArrowErrorCode ArrowBasicArrayStreamValidate(struct ArrowArrayStream* array_stream, - struct ArrowError* error); - -/// @} - -// Inline function definitions - - - -#ifdef __cplusplus -} -#endif - -#endif -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_BUFFER_INLINE_H_INCLUDED -#define NANOARROW_BUFFER_INLINE_H_INCLUDED - -#include -#include -#include - - - -#ifdef __cplusplus -extern "C" { -#endif - -static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) { - int64_t doubled_capacity = current_capacity * 2; - if (doubled_capacity > new_capacity) { - return doubled_capacity; - } else { - return new_capacity; - } -} - -static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { - buffer->data = NULL; - buffer->size_bytes = 0; - buffer->capacity_bytes = 0; - buffer->allocator = ArrowBufferAllocatorDefault(); -} - -static inline ArrowErrorCode ArrowBufferSetAllocator( - struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) { - if (buffer->data == NULL) { - buffer->allocator = allocator; - return NANOARROW_OK; - } else { - return EINVAL; - } -} - -static inline void ArrowBufferReset(struct ArrowBuffer* buffer) { - if (buffer->data != NULL) { - buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, - buffer->capacity_bytes); - buffer->data = NULL; - } - - buffer->capacity_bytes = 0; - buffer->size_bytes = 0; -} - -static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) { - memcpy(dst, src, sizeof(struct ArrowBuffer)); - src->data = NULL; - ArrowBufferReset(src); -} - -static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, - int64_t new_capacity_bytes, - char shrink_to_fit) { - if (new_capacity_bytes < 0) { - return EINVAL; - } - - if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) { - buffer->data = buffer->allocator.reallocate( - &buffer->allocator, buffer->data, buffer->capacity_bytes, new_capacity_bytes); - if (buffer->data == NULL && new_capacity_bytes > 0) { - buffer->capacity_bytes = 0; - buffer->size_bytes = 0; - return ENOMEM; - } - - buffer->capacity_bytes = new_capacity_bytes; - } - - // Ensures that when shrinking that size <= capacity - if (new_capacity_bytes < buffer->size_bytes) { - buffer->size_bytes = new_capacity_bytes; - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, - int64_t additional_size_bytes) { - int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes; - if (min_capacity_bytes <= buffer->capacity_bytes) { - return NANOARROW_OK; - } - - return ArrowBufferResize( - buffer, _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), 0); -} - -static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, - int64_t size_bytes) { - if (size_bytes > 0) { - memcpy(buffer->data + buffer->size_bytes, data, size_bytes); - buffer->size_bytes += size_bytes; - } -} - -static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, - const void* data, int64_t size_bytes) { - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); - - ArrowBufferAppendUnsafe(buffer, data, size_bytes); - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, - int8_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int8_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, - uint8_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint8_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, - int16_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int16_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, - uint16_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, - int32_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int32_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, - uint32_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, - int64_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, - uint64_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, - double value) { - return ArrowBufferAppend(buffer, &value, sizeof(double)); -} - -static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, - float value) { - return ArrowBufferAppend(buffer, &value, sizeof(float)); -} - -static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, - struct ArrowStringView value) { - return ArrowBufferAppend(buffer, value.data, value.size_bytes); -} - -static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, - struct ArrowBufferView value) { - return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); -} - -static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, - uint8_t value, int64_t size_bytes) { - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); - - memset(buffer->data + buffer->size_bytes, value, size_bytes); - buffer->size_bytes += size_bytes; - return NANOARROW_OK; -} - -static const uint8_t _ArrowkBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; -static const uint8_t _ArrowkFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; -static const uint8_t _ArrowkPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; -static const uint8_t _ArrowkTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; - -static const uint8_t _ArrowkBytePopcount[] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, - 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, - 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, - 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, - 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, - 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, - 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, - 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; - -static inline int64_t _ArrowRoundUpToMultipleOf8(int64_t value) { - return (value + 7) & ~((int64_t)7); -} - -static inline int64_t _ArrowRoundDownToMultipleOf8(int64_t value) { - return (value / 8) * 8; -} - -static inline int64_t _ArrowBytesForBits(int64_t bits) { - return (bits >> 3) + ((bits & 7) != 0); -} - -static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { - *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | - values[5] << 5 | values[6] << 6 | values[7] << 7); -} - -static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { - *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | - values[5] << 5 | values[6] << 6 | values[7] << 7); -} - -static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { - return (bits[i >> 3] >> (i & 0x07)) & 1; -} - -static inline void ArrowBitSet(uint8_t* bits, int64_t i) { - bits[i / 8] |= _ArrowkBitmask[i % 8]; -} - -static inline void ArrowBitClear(uint8_t* bits, int64_t i) { - bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; -} - -static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { - bits[i / 8] ^= - ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & _ArrowkBitmask[i % 8]; -} - -static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, - uint8_t bits_are_set) { - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - const uint8_t fill_byte = (uint8_t)(-bits_are_set); - - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_end = i_end / 8 + 1; - - const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; - const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; - - if (bytes_end == bytes_begin + 1) { - // set bits within a single byte - const uint8_t only_byte_mask = - i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); - bits[bytes_begin] &= only_byte_mask; - bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); - return; - } - - // set/clear trailing bits of first byte - bits[bytes_begin] &= first_byte_mask; - bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); - - if (bytes_end - bytes_begin > 2) { - // set/clear whole bytes - memset(bits + bytes_begin + 1, fill_byte, (size_t)(bytes_end - bytes_begin - 2)); - } - - if (i_end % 8 == 0) { - return; - } - - // set/clear leading bits of last byte - bits[bytes_end - 1] &= last_byte_mask; - bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); -} - -static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset, - int64_t length) { - if (length == 0) { - return 0; - } - - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_end = i_end / 8 + 1; - - if (bytes_end == bytes_begin + 1) { - // count bits within a single byte - const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; - const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; - - const uint8_t only_byte_mask = - i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); - - const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; - return _ArrowkBytePopcount[byte_masked]; - } - - const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; - const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; - int64_t count = 0; - - // first byte - count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; - - // middle bytes - for (int64_t i = bytes_begin + 1; i < (bytes_end - 1); i++) { - count += _ArrowkBytePopcount[bits[i]]; - } - - // last byte - count += _ArrowkBytePopcount[bits[bytes_end - 1] & ~last_byte_mask]; - - return count; -} - -static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap) { - ArrowBufferInit(&bitmap->buffer); - bitmap->size_bits = 0; -} - -static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst) { - ArrowBufferMove(&src->buffer, &dst->buffer); - dst->size_bits = src->size_bits; - src->size_bits = 0; -} - -static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, - int64_t additional_size_bits) { - int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; - if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { - return NANOARROW_OK; - } - - NANOARROW_RETURN_NOT_OK( - ArrowBufferReserve(&bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); - - bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, - int64_t new_capacity_bits, - char shrink_to_fit) { - if (new_capacity_bits < 0) { - return EINVAL; - } - - int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); - NANOARROW_RETURN_NOT_OK( - ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); - - if (new_capacity_bits < bitmap->size_bits) { - bitmap->size_bits = new_capacity_bits; - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); - - ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); - return NANOARROW_OK; -} - -static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length) { - ArrowBitsSetTo(bitmap->buffer.data, bitmap->size_bits, length, bits_are_set); - bitmap->size_bits += length; - bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); -} - -static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - const int8_t* values, int64_t n_values) { - if (n_values == 0) { - return; - } - - const int8_t* values_cursor = values; - int64_t n_remaining = n_values; - int64_t out_i_cursor = bitmap->size_bits; - uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; - - // First byte - if ((out_i_cursor % 8) != 0) { - int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; - for (int i = 0; i < n_partial_bits; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); - } - - out_cursor++; - values_cursor += n_partial_bits; - n_remaining -= n_partial_bits; - } - - // Middle bytes - int64_t n_full_bytes = n_remaining / 8; - for (int64_t i = 0; i < n_full_bytes; i++) { - _ArrowBitmapPackInt8(values_cursor, out_cursor); - values_cursor += 8; - out_cursor++; - } - - // Last byte - out_i_cursor += n_full_bytes * 8; - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte - *out_cursor = 0x00; - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); - } - out_cursor++; - } - - bitmap->size_bits += n_values; - bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; -} - -static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, - const int32_t* values, int64_t n_values) { - if (n_values == 0) { - return; - } - - const int32_t* values_cursor = values; - int64_t n_remaining = n_values; - int64_t out_i_cursor = bitmap->size_bits; - uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; - - // First byte - if ((out_i_cursor % 8) != 0) { - int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; - for (int i = 0; i < n_partial_bits; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); - } - - out_cursor++; - values_cursor += n_partial_bits; - n_remaining -= n_partial_bits; - } - - // Middle bytes - int64_t n_full_bytes = n_remaining / 8; - for (int64_t i = 0; i < n_full_bytes; i++) { - _ArrowBitmapPackInt32(values_cursor, out_cursor); - values_cursor += 8; - out_cursor++; - } - - // Last byte - out_i_cursor += n_full_bytes * 8; - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte - *out_cursor = 0x00; - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); - } - out_cursor++; - } - - bitmap->size_bits += n_values; - bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; -} - -static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { - ArrowBufferReset(&bitmap->buffer); - bitmap->size_bits = 0; -} - -#ifdef __cplusplus -} -#endif - -#endif -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_ARRAY_INLINE_H_INCLUDED -#define NANOARROW_ARRAY_INLINE_H_INCLUDED - -#include -#include -#include -#include -#include - - - - -#ifdef __cplusplus -extern "C" { -#endif - -static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - return &private_data->bitmap; -} - -static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - switch (i) { - case 0: - return &private_data->bitmap.buffer; - default: - return private_data->buffers + i - 1; - } -} - -// We don't currently support the case of unions where type_id != child_index; -// however, these functions are used to keep track of where that assumption -// is made. -static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array, - int8_t type_id) { - return type_id; -} - -static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array, - int8_t child_index) { - return child_index; -} - -static inline int8_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { - if (*type_ids == '\0') { - return 0; - } - - int32_t i = 0; - long type_id; - char* end_ptr; - do { - type_id = strtol(type_ids, &end_ptr, 10); - if (end_ptr == type_ids || type_id < 0 || type_id > 127) { - return -1; - } - - if (out != NULL) { - out[i] = type_id; - } - - i++; - - type_ids = end_ptr; - if (*type_ids == '\0') { - return i; - } else if (*type_ids != ',') { - return -1; - } else { - type_ids++; - } - } while (1); - - return -1; -} - -static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const int8_t* type_ids, - int64_t n_type_ids, - int64_t n_children) { - if (n_type_ids != n_children) { - return 0; - } - - for (int8_t i = 0; i < n_type_ids; i++) { - if (type_ids[i] != i) { - return 0; - } - } - - return 1; -} - -static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str, - int64_t n_children) { - int8_t type_ids[128]; - int8_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); - return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, n_children); -} - -static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_UNINITIALIZED: - return EINVAL; - case NANOARROW_TYPE_SPARSE_UNION: - case NANOARROW_TYPE_DENSE_UNION: - // Note that this value could be -1 if the type_ids string was invalid - if (private_data->union_type_id_is_child_index != 1) { - return EINVAL; - } else { - break; - } - default: - break; - } - if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { - return EINVAL; - } - - // Initialize any data offset buffer with a single zero - for (int i = 0; i < 3; i++) { - if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && - private_data->layout.element_size_bits[i] == 64) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); - } else if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && - private_data->layout.element_size_bits[i] == 32) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); - } - } - - // Start building any child arrays - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { - for (int64_t i = 0; i < 3; i++) { - struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); - NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); - } - - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode _ArrowArrayAppendBits(struct ArrowArray* array, - int64_t buffer_i, uint8_t value, - int64_t n) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); - int64_t bytes_required = - _ArrowRoundUpToMultipleOf8(private_data->layout.element_size_bits[buffer_i] * - (array->length + 1)) / - 8; - if (bytes_required > buffer->size_bytes) { - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendFill(buffer, 0, bytes_required - buffer->size_bytes)); - } - - ArrowBitsSetTo(buffer->data, array->length, n, value); - return NANOARROW_OK; -} - -static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* array, - int64_t n, uint8_t is_valid) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - if (n == 0) { - return NANOARROW_OK; - } - - // Some type-specific handling - switch (private_data->storage_type) { - case NANOARROW_TYPE_NA: - // (An empty value for a null array *is* a null) - array->null_count += n; - array->length += n; - return NANOARROW_OK; - - case NANOARROW_TYPE_DENSE_UNION: { - // Add one null to the first child and append n references to that child - int8_t type_id = _ArrowArrayUnionTypeId(array, 0); - NANOARROW_RETURN_NOT_OK( - _ArrowArrayAppendEmptyInternal(array->children[0], 1, is_valid)); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); - for (int64_t i = 0; i < n; i++) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( - ArrowArrayBuffer(array, 1), (int32_t)array->children[0]->length - 1)); - } - // For the purposes of array->null_count, union elements are never considered "null" - // even if some children contain nulls. - array->length += n; - return NANOARROW_OK; - } - - case NANOARROW_TYPE_SPARSE_UNION: { - // Add n nulls to the first child and append n references to that child - int8_t type_id = _ArrowArrayUnionTypeId(array, 0); - NANOARROW_RETURN_NOT_OK( - _ArrowArrayAppendEmptyInternal(array->children[0], n, is_valid)); - for (int64_t i = 1; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); - } - - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); - // For the purposes of array->null_count, union elements are never considered "null" - // even if some children contain nulls. - array->length += n; - return NANOARROW_OK; - } - - case NANOARROW_TYPE_FIXED_SIZE_LIST: - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( - array->children[0], n * private_data->layout.child_size_elements)); - break; - case NANOARROW_TYPE_STRUCT: - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); - } - break; - - default: - break; - } - - // Append n is_valid bits to the validity bitmap. If we haven't allocated a bitmap yet - // and we need to append nulls, do it now. - if (!is_valid && private_data->bitmap.buffer.data == NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, array->length + n)); - ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); - ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); - } else if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); - ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); - } - - // Add appropriate buffer fill - struct ArrowBuffer* buffer; - int64_t size_bytes; - - for (int i = 0; i < 3; i++) { - buffer = ArrowArrayBuffer(array, i); - size_bytes = private_data->layout.element_size_bits[i] / 8; - - switch (private_data->layout.buffer_type[i]) { - case NANOARROW_BUFFER_TYPE_NONE: - case NANOARROW_BUFFER_TYPE_VALIDITY: - continue; - case NANOARROW_BUFFER_TYPE_DATA_OFFSET: - // Append the current value at the end of the offset buffer for each element - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes * n)); - - for (int64_t j = 0; j < n; j++) { - ArrowBufferAppendUnsafe(buffer, buffer->data + size_bytes * (array->length + j), - size_bytes); - } - - // Skip the data buffer - i++; - continue; - case NANOARROW_BUFFER_TYPE_DATA: - // Zero out the next bit of memory - if (private_data->layout.element_size_bits[i] % 8 == 0) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill(buffer, 0, size_bytes * n)); - } else { - NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, i, 0, n)); - } - continue; - - case NANOARROW_BUFFER_TYPE_TYPE_ID: - case NANOARROW_BUFFER_TYPE_UNION_OFFSET: - // These cases return above - return EINVAL; - } - } - - array->length += n; - array->null_count += n * !is_valid; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) { - return _ArrowArrayAppendEmptyInternal(array, n, 0); -} - -static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n) { - return _ArrowArrayAppendEmptyInternal(array, n, 1); -} - -static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, - int64_t value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_INT64: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); - break; - case NANOARROW_TYPE_INT32: - _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, (int32_t)value)); - break; - case NANOARROW_TYPE_INT16: - _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt16(data_buffer, (int16_t)value)); - break; - case NANOARROW_TYPE_INT8: - _NANOARROW_CHECK_RANGE(value, INT8_MIN, INT8_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(data_buffer, (int8_t)value)); - break; - case NANOARROW_TYPE_UINT64: - case NANOARROW_TYPE_UINT32: - case NANOARROW_TYPE_UINT16: - case NANOARROW_TYPE_UINT8: - _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); - return ArrowArrayAppendUInt(array, value); - case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, value)); - break; - case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, value)); - break; - case NANOARROW_TYPE_BOOL: - NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, - uint64_t value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_UINT64: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); - break; - case NANOARROW_TYPE_UINT32: - _NANOARROW_CHECK_RANGE(value, 0, UINT32_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); - break; - case NANOARROW_TYPE_UINT16: - _NANOARROW_CHECK_RANGE(value, 0, UINT16_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); - break; - case NANOARROW_TYPE_UINT8: - _NANOARROW_CHECK_RANGE(value, 0, UINT8_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); - break; - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_INT8: - _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); - return ArrowArrayAppendInt(array, value); - case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, value)); - break; - case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, value)); - break; - case NANOARROW_TYPE_BOOL: - NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, - double value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(double))); - break; - case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, - struct ArrowBufferView value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); - struct ArrowBuffer* data_buffer = ArrowArrayBuffer( - array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); - int32_t offset; - int64_t large_offset; - int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - offset = ((int32_t*)offset_buffer->data)[array->length]; - if ((offset + value.size_bytes) > INT32_MAX) { - return EINVAL; - } - - offset += value.size_bytes; - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - large_offset = ((int64_t*)offset_buffer->data)[array->length]; - large_offset += value.size_bytes; - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - if (value.size_bytes != fixed_size_bytes) { - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, - struct ArrowStringView value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBufferView buffer_view; - buffer_view.data.data = value.data; - buffer_view.size_bytes = value.size_bytes; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_LARGE_STRING: - return ArrowArrayAppendBytes(array, buffer_view); - default: - return EINVAL; - } -} - -static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, - struct ArrowDecimal* value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_DECIMAL128: - if (value->n_words != 2) { - return EINVAL; - } else { - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value->words, 2 * sizeof(uint64_t))); - break; - } - case NANOARROW_TYPE_DECIMAL256: - if (value->n_words != 4) { - return EINVAL; - } else { - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value->words, 4 * sizeof(uint64_t))); - break; - } - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - int64_t child_length; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_MAP: - child_length = array->children[0]->length; - if (child_length > INT32_MAX) { - return EINVAL; - } - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), (int32_t)child_length)); - break; - case NANOARROW_TYPE_LARGE_LIST: - child_length = array->children[0]->length; - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendInt64(ArrowArrayBuffer(array, 1), child_length)); - break; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - child_length = array->children[0]->length; - if (child_length != - ((array->length + 1) * private_data->layout.child_size_elements)) { - return EINVAL; - } - break; - case NANOARROW_TYPE_STRUCT: - for (int64_t i = 0; i < array->n_children; i++) { - child_length = array->children[i]->length; - if (child_length != (array->length + 1)) { - return EINVAL; - } - } - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, - int8_t type_id) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); - if (child_index < 0 || child_index >= array->n_children) { - return EINVAL; - } - - switch (private_data->storage_type) { - case NANOARROW_TYPE_DENSE_UNION: - // Apppend the target child length to the union offsets buffer - _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( - ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); - break; - case NANOARROW_TYPE_SPARSE_UNION: - // Append one empty to any non-target column that isn't already the right length - // or abort if appending a null will result in a column with invalid length - for (int64_t i = 0; i < array->n_children; i++) { - if (i == child_index || array->children[i]->length == (array->length + 1)) { - continue; - } - - if (array->children[i]->length != array->length) { - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1)); - } - - break; - default: - return EINVAL; - } - - // Write to the type_ids buffer - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); - array->length++; - return NANOARROW_OK; -} - -static inline void ArrowArrayViewMove(struct ArrowArrayView* src, - struct ArrowArrayView* dst) { - memcpy(dst, src, sizeof(struct ArrowArrayView)); - ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); -} - -static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i) { - const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; - i += array_view->array->offset; - switch (array_view->storage_type) { - case NANOARROW_TYPE_NA: - return 0x01; - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_SPARSE_UNION: - // Unions are "never null" in Arrow land - return 0x00; - default: - return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); - } -} - -static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view, - int64_t i) { - switch (array_view->storage_type) { - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_SPARSE_UNION: - return array_view->buffer_views[0].data.as_int8[i]; - default: - return -1; - } -} - -static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view, - int64_t i) { - int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); - if (array_view->union_type_id_map == NULL) { - return type_id; - } else { - return array_view->union_type_id_map[type_id]; - } -} - -static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view, - int64_t i) { - switch (array_view->storage_type) { - case NANOARROW_TYPE_DENSE_UNION: - return array_view->buffer_views[1].data.as_int32[i]; - case NANOARROW_TYPE_SPARSE_UNION: - return i; - default: - return -1; - } -} - -static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view, - int64_t i) { - struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - i += array_view->array->offset; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INT64: - return data_view->data.as_int64[i]; - case NANOARROW_TYPE_UINT64: - return data_view->data.as_uint64[i]; - case NANOARROW_TYPE_INT32: - return data_view->data.as_int32[i]; - case NANOARROW_TYPE_UINT32: - return data_view->data.as_uint32[i]; - case NANOARROW_TYPE_INT16: - return data_view->data.as_int16[i]; - case NANOARROW_TYPE_UINT16: - return data_view->data.as_uint16[i]; - case NANOARROW_TYPE_INT8: - return data_view->data.as_int8[i]; - case NANOARROW_TYPE_UINT8: - return data_view->data.as_uint8[i]; - case NANOARROW_TYPE_DOUBLE: - return data_view->data.as_double[i]; - case NANOARROW_TYPE_FLOAT: - return data_view->data.as_float[i]; - case NANOARROW_TYPE_BOOL: - return ArrowBitGet(data_view->data.as_uint8, i); - default: - return INT64_MAX; - } -} - -static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_view, - int64_t i) { - i += array_view->array->offset; - struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INT64: - return data_view->data.as_int64[i]; - case NANOARROW_TYPE_UINT64: - return data_view->data.as_uint64[i]; - case NANOARROW_TYPE_INT32: - return data_view->data.as_int32[i]; - case NANOARROW_TYPE_UINT32: - return data_view->data.as_uint32[i]; - case NANOARROW_TYPE_INT16: - return data_view->data.as_int16[i]; - case NANOARROW_TYPE_UINT16: - return data_view->data.as_uint16[i]; - case NANOARROW_TYPE_INT8: - return data_view->data.as_int8[i]; - case NANOARROW_TYPE_UINT8: - return data_view->data.as_uint8[i]; - case NANOARROW_TYPE_DOUBLE: - return data_view->data.as_double[i]; - case NANOARROW_TYPE_FLOAT: - return data_view->data.as_float[i]; - case NANOARROW_TYPE_BOOL: - return ArrowBitGet(data_view->data.as_uint8, i); - default: - return UINT64_MAX; - } -} - -static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_view, - int64_t i) { - i += array_view->array->offset; - struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INT64: - return data_view->data.as_int64[i]; - case NANOARROW_TYPE_UINT64: - return data_view->data.as_uint64[i]; - case NANOARROW_TYPE_INT32: - return data_view->data.as_int32[i]; - case NANOARROW_TYPE_UINT32: - return data_view->data.as_uint32[i]; - case NANOARROW_TYPE_INT16: - return data_view->data.as_int16[i]; - case NANOARROW_TYPE_UINT16: - return data_view->data.as_uint16[i]; - case NANOARROW_TYPE_INT8: - return data_view->data.as_int8[i]; - case NANOARROW_TYPE_UINT8: - return data_view->data.as_uint8[i]; - case NANOARROW_TYPE_DOUBLE: - return data_view->data.as_double[i]; - case NANOARROW_TYPE_FLOAT: - return data_view->data.as_float[i]; - case NANOARROW_TYPE_BOOL: - return ArrowBitGet(data_view->data.as_uint8, i); - default: - return DBL_MAX; - } -} - -static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( - struct ArrowArrayView* array_view, int64_t i) { - i += array_view->array->offset; - struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; - const char* data_view = array_view->buffer_views[2].data.as_char; - - struct ArrowStringView view; - switch (array_view->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - view.data = data_view + offsets_view->data.as_int32[i]; - view.size_bytes = - offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; - break; - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - view.data = data_view + offsets_view->data.as_int64[i]; - view.size_bytes = - offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; - break; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - view.size_bytes = array_view->layout.element_size_bits[1] / 8; - view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes); - break; - default: - view.data = NULL; - view.size_bytes = 0; - break; - } - - return view; -} - -static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( - struct ArrowArrayView* array_view, int64_t i) { - i += array_view->array->offset; - struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; - const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; - - struct ArrowBufferView view; - switch (array_view->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - view.size_bytes = - offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; - view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; - break; - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - view.size_bytes = - offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; - view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; - break; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - view.size_bytes = array_view->layout.element_size_bits[1] / 8; - view.data.as_uint8 = - array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes); - break; - default: - view.data.data = NULL; - view.size_bytes = 0; - break; - } - - return view; -} - -static inline void ArrowArrayViewGetDecimalUnsafe(struct ArrowArrayView* array_view, - int64_t i, struct ArrowDecimal* out) { - i += array_view->array->offset; - const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; - switch (array_view->storage_type) { - case NANOARROW_TYPE_DECIMAL128: - ArrowDecimalSetBytes(out, data_view + (i * 16)); - break; - case NANOARROW_TYPE_DECIMAL256: - ArrowDecimalSetBytes(out, data_view + (i * 32)); - break; - default: - memset(out->words, 0, sizeof(out->words)); - break; - } -} - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index 4a312bb4c3..4c58f7012b 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -1,6 +1,7 @@ -#include // for R interface to C++ -#include // for C interface to Arrow -#include // for fromInteger64 +#include // for R interface to C++ +#include // for C interface to Arrow (via R package) +#include // for C/C++ interface to Arrow +#include // for fromInteger64 // we currently get deprecation warnings by default which are noisy #ifndef TILEDB_NO_API_DEPRECATION_WARNINGS @@ -48,15 +49,15 @@ Rcpp::XPtr array_setup_struct(Rcpp::XPtr arrxp, int64_t //' @noRd // [[Rcpp::export(soma_array_reader_impl)]] -Rcpp::List soma_array_reader(const std::string& uri, - Rcpp::Nullable colnames = R_NilValue, - Rcpp::Nullable> qc = R_NilValue, - Rcpp::Nullable dim_points = R_NilValue, - Rcpp::Nullable dim_ranges = R_NilValue, - std::string batch_size = "auto", - std::string result_order = "auto", - const std::string& loglevel = "auto", - Rcpp::Nullable config = R_NilValue) { +SEXP soma_array_reader(const std::string& uri, + Rcpp::Nullable colnames = R_NilValue, + Rcpp::Nullable> qc = R_NilValue, + Rcpp::Nullable dim_points = R_NilValue, + Rcpp::Nullable dim_ranges = R_NilValue, + std::string batch_size = "auto", + std::string result_order = "auto", + const std::string& loglevel = "auto", + Rcpp::Nullable config = R_NilValue) { if (loglevel != "auto") { spdl::set_level(loglevel); @@ -79,7 +80,7 @@ Rcpp::List soma_array_reader(const std::string& uri, // Read selected columns from the uri (return is unique_ptr) auto sr = tdbs::SOMAArray::open(OpenMode::read, - uri, + uri, "unnamed", // name parameter could be added platform_config, column_names, @@ -130,17 +131,22 @@ Rcpp::List soma_array_reader(const std::string& uri, const std::vector names = sr_data->get()->names(); auto ncol = names.size(); - Rcpp::XPtr schemaxp = schema_owning_xptr(); - Rcpp::XPtr arrayxp = array_owning_xptr(); - schemaxp = schema_setup_struct(schemaxp, ncol); - arrayxp = array_setup_struct(arrayxp, ncol); - arrayxp->length = 0; + // Schema first + auto schemaxp = nanoarrow_schema_owning_xptr(); + auto sch = nanoarrow_output_schema_from_xptr(schemaxp); + exitIfError(ArrowSchemaInitFromType(sch, NANOARROW_TYPE_STRUCT), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, ""), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(sch, ncol), "Bad schema children alloc"); - for (size_t i=0; i chldschemaxp = schema_owning_xptr(); - Rcpp::XPtr chldarrayxp = array_owning_xptr(); + // Array second + auto arrayxp = nanoarrow_array_owning_xptr(); + auto arr = nanoarrow_output_array_from_xptr(arrayxp); + exitIfError(ArrowArrayInitFromType(arr, NANOARROW_TYPE_STRUCT), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, ncol), "Bad array children alloc"); + + arr->length = 0; // initial value + for (size_t i=0; ichildren[i], pp.second.get(), sizeof(ArrowSchema)); + //memcpy((void*) arr->children[i], pp.first.get(), sizeof(ArrowArray)); + ArrowArrayMove(pp.first.get(), arr->children[i]); + ArrowSchemaMove(pp.second.get(), sch->children[i]); spdl::info("[soma_array_reader] Incoming name {} length {}", std::string(pp.second->name), pp.first->length); - schemaxp->children[i] = chldschemaxp; - arrayxp->children[i] = chldarrayxp; - - // if (buf->has_enumeration()) { - // auto vec = buf->get_enumeration(); - // Rcpp::Rcout << names[i] << ": "; - // for (auto& s: vec) { - // Rcpp::Rcout << s << " "; - // } - // Rcpp::Rcout << std::endl; - // } - - if (pp.first->length > arrayxp->length) { + if (pp.first->length > arr->length) { spdl::debug("[soma_array_reader] Setting array length to {}", pp.first->length); - arrayxp->length = pp.first->length; + arr->length = pp.first->length; } } - Rcpp::List as = Rcpp::List::create(Rcpp::Named("array_data") = arrayxp, - Rcpp::Named("schema") = schemaxp); - return as; + // Nanoarrow special: stick schema into xptr tag to return single SEXP + array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array + return arrayxp; } //' Set the logging level for the R package and underlying C++ library diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index 1cd170aacc..7521feb56a 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -3,15 +3,18 @@ #define TILEDB_NO_API_DEPRECATION_WARNINGS #endif -#include // for R interface to C++ -#include // for C interface to Arrow +//#define RCPP_DEBUG_LEVEL 5 + +#include // for R interface to C++ +#include // for C interface to Arrow (via R package nanoarrow) +#include #include #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 4 #include #endif -// We get these via nanoarrow and must cannot include carrow.h again +// We get these via nanoarrow and must not include carrow.h again #define ARROW_SCHEMA_AND_ARRAY_DEFINED 1 #include @@ -79,8 +82,6 @@ Rcpp::List sr_setup(const std::string& uri, Rcpp::Nullable timestamp_end = R_NilValue, const std::string& loglevel = "auto") { - //Rcpp::XPtr sr_setup(const std::string& uri, - if (loglevel != "auto") { spdl::set_level(loglevel); tdbs::LOG_SET_LEVEL(loglevel); @@ -98,7 +99,7 @@ Rcpp::List sr_setup(const std::string& uri, std::shared_ptr ctxptr = std::make_shared(cfg); ctx_wrap_t* ctxwrap_p = new ContextWrapper(ctxptr); - Rcpp::XPtr ctx_wrap_xptr = make_xptr(ctxwrap_p); + Rcpp::XPtr ctx_wrap_xptr = make_xptr(ctxwrap_p, false); if (!colnames.isNull()) { column_names = Rcpp::as>(colnames); @@ -164,20 +165,35 @@ bool sr_complete(Rcpp::XPtr sr) { return res; } -Rcpp::List create_empty_arrow_table() { - Rcpp::XPtr schemaxp = schema_owning_xptr(); - Rcpp::XPtr arrayxp = array_owning_xptr(); - schemaxp = schema_setup_struct(schemaxp, 0); - arrayxp = array_setup_struct(arrayxp, 0); - arrayxp->length = 0; - Rcpp::List as = Rcpp::List::create(Rcpp::Named("array_data") = arrayxp, - Rcpp::Named("schema") = schemaxp); - return as; +//' @noRd +//' @import nanoarrow +// [[Rcpp::export]] +SEXP create_empty_arrow_table() { + int ncol = 0; + + // Schema first + auto schemaxp = nanoarrow_schema_owning_xptr(); + auto sch = nanoarrow_output_schema_from_xptr(schemaxp); + exitIfError(ArrowSchemaInitFromType(sch, NANOARROW_TYPE_STRUCT), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, ""), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(sch, ncol), "Bad schema children alloc"); + + // Array second + auto arrayxp = nanoarrow_array_owning_xptr(); + auto arr = nanoarrow_output_array_from_xptr(arrayxp); + exitIfError(ArrowArrayInitFromType(arr, NANOARROW_TYPE_STRUCT), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, ncol), "Bad array children alloc"); + arr->length = 0; + + // Nanoarrow special: stick schema into xptr tag to return single SEXP + array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array + + return arrayxp; } // [[Rcpp::export]] -Rcpp::List sr_next(Rcpp::XPtr sr) { +SEXP sr_next(Rcpp::XPtr sr) { check_xptr_tag(sr); if (sr_complete(sr)) { @@ -198,17 +214,22 @@ Rcpp::List sr_next(Rcpp::XPtr sr) { const std::vector names = sr_data->get()->names(); auto ncol = names.size(); - Rcpp::XPtr schemaxp = schema_owning_xptr(); - Rcpp::XPtr arrayxp = array_owning_xptr(); - schemaxp = schema_setup_struct(schemaxp, ncol); - arrayxp = array_setup_struct(arrayxp, ncol); - arrayxp->length = 0; + // Schema first + auto schemaxp = nanoarrow_schema_owning_xptr(); + auto sch = nanoarrow_output_schema_from_xptr(schemaxp); + exitIfError(ArrowSchemaInitFromType(sch, NANOARROW_TYPE_STRUCT), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, ""), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(sch, ncol), "Bad schema children alloc"); - for (size_t i=0; i chldschemaxp = schema_owning_xptr(); - Rcpp::XPtr chldarrayxp = array_owning_xptr(); + // Array second + auto arrayxp = nanoarrow_array_owning_xptr(); + auto arr = nanoarrow_output_array_from_xptr(arrayxp); + exitIfError(ArrowArrayInitFromType(arr, NANOARROW_TYPE_STRUCT), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, ncol), "Bad array children alloc"); + arr->length = 0; // initial value + + for (size_t i=0; i sr) { // this is pair of array and schema pointer auto pp = tdbs::ArrowAdapter::to_arrow(buf); - memcpy((void*) chldschemaxp, pp.second.get(), sizeof(ArrowSchema)); - memcpy((void*) chldarrayxp, pp.first.get(), sizeof(ArrowArray)); + ArrowArrayMove(pp.first.get(), arr->children[i]); + ArrowSchemaMove(pp.second.get(), sch->children[i]); - schemaxp->children[i] = chldschemaxp; - arrayxp->children[i] = chldarrayxp; - - if (pp.first->length > arrayxp->length) { + if (pp.first->length > arr->length) { spdl::debug("[soma_array_reader] Setting array length to {}", pp.first->length); - arrayxp->length = pp.first->length; + arr->length = pp.first->length; } - } - spdl::debug("[sr_next] Exporting chunk with {} rows", arrayxp->length); - Rcpp::List as = Rcpp::List::create(Rcpp::Named("array_data") = arrayxp, - Rcpp::Named("schema") = schemaxp); - return as; + spdl::debug("[sr_next] Exporting chunk with {} rows", arr->length); + // Nanoarrow special: stick schema into xptr tag to return single SEXP + array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array + return arrayxp; } diff --git a/apis/r/src/rutilities.cpp b/apis/r/src/rutilities.cpp index 2430dd8d81..e986eef836 100644 --- a/apis/r/src/rutilities.cpp +++ b/apis/r/src/rutilities.cpp @@ -4,9 +4,9 @@ #define TILEDB_NO_API_DEPRECATION_WARNINGS #endif -#include // for R interface to C++ -#include // for C interface to Arrow -#include // for fromInteger64 +#include // for R interface to C++ +#include // for C interface to Arrow +#include // for fromInteger64 // We get these via nanoarrow and must cannot include carrow.h again #define ARROW_SCHEMA_AND_ARRAY_DEFINED 1 @@ -205,7 +205,7 @@ Rcpp::XPtr schema_setup_struct(Rcpp::XPtr schxp, int64 } extern "C" { - void ArrowArrayRelease(struct ArrowArray *array); // made non-static in nanoarrow.c + void ArrowArrayReleaseInternal(struct ArrowArray *array); // non-static in nanoarrow.c ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, // ditto enum ArrowType storage_type); } @@ -222,7 +222,7 @@ Rcpp::XPtr array_setup_struct(Rcpp::XPtr arrxp, int64_t array->buffers = NULL; array->children = NULL; array->dictionary = NULL; - array->release = &ArrowArrayRelease; + array->release = &ArrowArrayReleaseInternal; array->private_data = NULL; auto private_data = (struct ArrowArrayPrivateData*) ArrowMalloc(sizeof(struct ArrowArrayPrivateData)); diff --git a/apis/r/src/rutilities.h b/apis/r/src/rutilities.h index 3669252ae9..f40ee755af 100644 --- a/apis/r/src/rutilities.h +++ b/apis/r/src/rutilities.h @@ -62,3 +62,13 @@ struct ContextWrapper { std::shared_ptr ctxptr; }; typedef struct ContextWrapper ctx_wrap_t; + +inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { + if (ec != NANOARROW_OK) Rcpp::stop(msg); +} + +// Attaches a schema to an array external pointer. The nanoarrow R package +// attempts to do this whenever possible to avoid misinterpreting arrays. +inline void array_xptr_set_schema(SEXP array_xptr, SEXP schema_xptr) { + R_SetExternalPtrTag(array_xptr, schema_xptr); +} diff --git a/apis/r/tests/testthat/helper-test-data.R b/apis/r/tests/testthat/helper-test-data.R index 7a51dd14e1..c690bd0e48 100644 --- a/apis/r/tests/testthat/helper-test-data.R +++ b/apis/r/tests/testthat/helper-test-data.R @@ -39,19 +39,20 @@ create_dense_matrix_with_int_dims <- function(nrows = 10, ncols = 5, seed = 1) { } create_arrow_schema <- function(foo_first = TRUE) { + bl <- FALSE if (foo_first) { arrow::schema( - arrow::field("foo", arrow::int32(), nullable = FALSE), - arrow::field("soma_joinid", arrow::int64(), nullable = FALSE), - arrow::field("bar", arrow::float64(), nullable = FALSE), - arrow::field("baz", arrow::large_utf8(), nullable = FALSE) + arrow::field("foo", arrow::int32(), nullable = bl), + arrow::field("soma_joinid", arrow::int64(), nullable = bl), + arrow::field("bar", arrow::float64(), nullable = bl), + arrow::field("baz", arrow::large_utf8(), nullable = bl) ) } else { arrow::schema( - arrow::field("soma_joinid", arrow::int64(), nullable = FALSE), - arrow::field("foo", arrow::int32(), nullable = FALSE), - arrow::field("bar", arrow::float64(), nullable = FALSE), - arrow::field("baz", arrow::large_utf8(), nullable = FALSE) + arrow::field("soma_joinid", arrow::int64(), nullable = bl), + arrow::field("foo", arrow::int32(), nullable = bl), + arrow::field("bar", arrow::float64(), nullable = bl), + arrow::field("baz", arrow::large_utf8(), nullable = bl) ) } } diff --git a/apis/r/tests/testthat/test-Factory.R b/apis/r/tests/testthat/test-Factory.R index 019b486d68..59e1896310 100644 --- a/apis/r/tests/testthat/test-Factory.R +++ b/apis/r/tests/testthat/test-Factory.R @@ -16,7 +16,7 @@ test_that("DataFrame Factory", { # Check opening to read expect_silent(d3 <- SOMADataFrameOpen(uri)) expect_silent(chk <- d3$read()$concat()) - expect_equal(tbl, chk) + expect_equal(tibble::as_tibble(tbl), tibble::as_tibble(chk)) }) test_that("DataFrame Factory with specified index_column_names", { @@ -35,7 +35,7 @@ test_that("DataFrame Factory with specified index_column_names", { expect_silent(d3 <- SOMADataFrameOpen(uri)) expect_equal(d3$mode(), "READ") expect_silent(chk <- d3$read()$concat()) - expect_equal(tbl, chk) + expect_equal(tibble::as_tibble(tbl), tibble::as_tibble(chk)) d3$close() expect_equal(d3$mode(), "CLOSED") }) diff --git a/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R b/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R index 4f6509d883..6f19409a40 100644 --- a/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R +++ b/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R @@ -29,9 +29,8 @@ test_that("Iterated Interface from SOMAArrayReader", { expect_true(is.data.frame(rl)) expect_equal(nrow(rl), 4848644) expect_equal(ncol(rl), 3) - rm(sr) - #gc() + gc() srret <- sr_setup(uri, config=as.character(config), dim_points=list(soma_dim_0=as.integer64(1))) sr <- srret$sr @@ -50,7 +49,7 @@ test_that("Iterated Interface from SOMAArrayReader", { expect_equal(ncol(rl), 3) rm(sr) - #gc() + gc() srret <- sr_setup(uri, config=as.character(config), dim_range=list(soma_dim_1=cbind(as.integer64(1),as.integer64(2)))) sr <- srret$sr @@ -195,6 +194,7 @@ test_that("Iterated Interface from SOMA Sparse Matrix", { test_that("Dimension Point and Ranges Bounds", { skip_if(!extended_tests() || covr_tests()) ctx <- tiledbsoma::SOMATileDBContext$new() + config <- as.character(tiledb::config(ctx$context())) human_experiment <- load_dataset("soma-exp-pbmc-small", tiledbsoma_ctx = ctx) X <- human_experiment$ms$get("RNA")$X$get("data") @@ -207,7 +207,7 @@ test_that("Dimension Point and Ranges Bounds", { sr <- srret$sr chunk <- sr_next(sr) - at <- arrow::as_arrow_table(arrow::RecordBatch$import_from_c(chunk$array_data, chunk$schema)) + at <- arrow::as_arrow_table(chunk) expect_equal(at$num_rows, 5) expect_equal(at$num_columns, 3) rm(sr) @@ -220,7 +220,7 @@ test_that("Dimension Point and Ranges Bounds", { sr <- srret$sr chunk <- sr_next(sr) - at <- arrow::as_arrow_table(arrow::RecordBatch$import_from_c(chunk$array_data, chunk$schema)) + at <- arrow::as_arrow_table(chunk) expect_equal(at$num_rows, 2) expect_equal(at$num_columns, 3) diff --git a/apis/r/tests/testthat/test-SOMADataFrame.R b/apis/r/tests/testthat/test-SOMADataFrame.R index 61b9ebace1..5984b7887d 100644 --- a/apis/r/tests/testthat/test-SOMADataFrame.R +++ b/apis/r/tests/testthat/test-SOMADataFrame.R @@ -783,7 +783,7 @@ test_that("missing levels in enums", { test_that("factor levels can grow without overlap", { - + skip_if(!extended_tests()) uri <- tempfile() schema <- arrow::schema(arrow::field(name = "soma_joinid", type = arrow::int64()), arrow::field(name = "obs_col_like", @@ -822,6 +822,7 @@ test_that("factor levels can grow without overlap", { }) test_that("factor levels cannot extend beyond index limit", { + skip_if(!extended_tests()) for (tp in c("INT8", "UINT8")) { uri <- tempfile() idx_type <- if (tp == "INT8") arrow::int8() else arrow::uint8() diff --git a/apis/r/tools/r-ci.sh b/apis/r/tools/r-ci.sh index 866ce4031a..b8e9ad4ca2 100755 --- a/apis/r/tools/r-ci.sh +++ b/apis/r/tools/r-ci.sh @@ -354,12 +354,12 @@ InstallDeps() { } InstallDepsAndSuggests() { - sudo Rscript -e 'remotes::install_deps(".", dependencies=TRUE)' + sudo Rscript -e 'options(timeout = max(300, getOption("timeout"))); remotes::install_deps(".", dependencies=TRUE)' } DumpSysinfo() { echo "Dumping system information." - R -e '.libPaths(); sessionInfo(); installed.packages()' + Rscript -e '.libPaths(); sessionInfo(); installed.packages()' } DumpLogsByExtension() { diff --git a/libtiledbsoma/src/CMakeLists.txt b/libtiledbsoma/src/CMakeLists.txt index 4205da0319..310549a806 100644 --- a/libtiledbsoma/src/CMakeLists.txt +++ b/libtiledbsoma/src/CMakeLists.txt @@ -69,6 +69,7 @@ add_library(TILEDB_SOMA_OBJECTS OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/version.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils/nanoarrow.c ${CMAKE_CURRENT_SOURCE_DIR}/external/src/thread_pool/thread_pool.cc ${CMAKE_CURRENT_SOURCE_DIR}/external/src/thread_pool/status.cc @@ -173,7 +174,7 @@ if(SANITIZER) endif() endif() -# Install header files +# Install header files # target_sources FILE_SET is preferred with cmake>=3.23 # TODO Uncomment after finishing Python and R bindings # set(TILEDB_SOMA_PUBLIC_HEADERS @@ -188,21 +189,21 @@ endif() # ${CMAKE_CURRENT_SOURCE_DIR}/cpp_api/logger_public.h # ) -install(FILES +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/soma/enums.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/logger_public.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_context.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_context.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/managed_query.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/array_buffers.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/column_buffer.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_array.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/array_buffers.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/column_buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_array.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_group.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_collection.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dataframe.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_collection.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dataframe.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dense_ndarray.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_sparse_ndarray.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_sparse_ndarray.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_object.h DESTINATION "include/tiledbsoma/soma" ) @@ -213,7 +214,8 @@ install(FILES ) install(FILES - ${CMAKE_CURRENT_SOURCE_DIR}/utils/carrow.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/nanoarrow.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/nanoarrow.hpp DESTINATION "include/tiledbsoma/utils/" ) @@ -223,11 +225,10 @@ install(FILES ) install(FILES - ${CMAKE_CURRENT_SOURCE_DIR}/utils/arrow_adapter.h - ${CMAKE_CURRENT_SOURCE_DIR}/utils/carrow.h - ${CMAKE_CURRENT_SOURCE_DIR}/utils/common.h - ${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.h - ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/arrow_adapter.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/common.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/version.h DESTINATION "include/tiledbsoma/utils" diff --git a/libtiledbsoma/src/cli/cli.cc b/libtiledbsoma/src/cli/cli.cc index b0eabc788c..4f698ed3a8 100644 --- a/libtiledbsoma/src/cli/cli.cc +++ b/libtiledbsoma/src/cli/cli.cc @@ -33,7 +33,6 @@ #include "soma/enums.h" #include "soma/soma_array.h" #include "utils/arrow_adapter.h" -#include "utils/carrow.h" #include "utils/logger.h" using namespace tiledbsoma; diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 4f52c0a9be..66a87d863e 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022 TileDB, Inc. + * @copyright Copyright (c) 2022-2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,66 +39,112 @@ namespace tiledbsoma { using namespace tiledb; void ArrowAdapter::release_schema(struct ArrowSchema* schema) { - schema->release = nullptr; + if (schema->name != nullptr) + LOG_DEBUG( + fmt::format("[ArrowAdapter] release_schema for {}", schema->name)); + + if (schema->name != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->name"); + free((void*)schema->name); + schema->name = nullptr; + } + if (schema->format != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->format"); + free((void*)schema->format); + schema->format = nullptr; + } + if (schema->metadata != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->metadata"); + free((void*)schema->metadata); + schema->metadata = nullptr; + } - for (int i = 0; i < schema->n_children; ++i) { - struct ArrowSchema* child = schema->children[i]; - if (schema->name != nullptr) { - free((void*)schema->name); - schema->name = nullptr; - } - if (child->release != NULL) { - child->release(child); + if (schema->children != nullptr) { + for (auto i = 0; i < schema->n_children; i++) { + if (schema->children[i] != nullptr) { + if (schema->children[i]->release != nullptr) { + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema schema->child {} " + "release", + i)); + release_schema(schema->children[i]); + } + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema schema->child {} free", i)); + free(schema->children[i]); + } } - free(child); + LOG_TRACE("[ArrowAdapter] release_schema schema->children"); + free(schema->children); + schema->children = nullptr; } - free(schema->children); - struct ArrowSchema* dict = schema->dictionary; - if (dict != nullptr) { - if (dict->format != nullptr) { - free((void*)dict->format); - dict->format = nullptr; - } - if (dict->release != nullptr) { - delete dict; - dict = nullptr; + if (schema->dictionary != nullptr) { + if (schema->dictionary->release != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->dict release"); + release_schema(schema->dictionary); } + LOG_TRACE("[ArrowAdapter] release_schema schema->dict free"); + free(schema->dictionary); + schema->dictionary = nullptr; } - LOG_TRACE("[ArrowAdapter] release_schema"); + schema->release = nullptr; + LOG_TRACE("[ArrowAdapter] release_schema done"); } void ArrowAdapter::release_array(struct ArrowArray* array) { auto arrow_buffer = static_cast(array->private_data); - - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_array {} use_count={}", - arrow_buffer->buffer_->name(), - arrow_buffer->buffer_.use_count())); - - // Delete the ArrowBuffer, which was allocated with new. - // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the - // underlying ColumnBuffer, the ColumnBuffer will be deleted. - delete arrow_buffer; + if (arrow_buffer != nullptr) { + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_array {} use_count={}", + arrow_buffer->buffer_->name(), + arrow_buffer->buffer_.use_count())); + + // Delete the ArrowBuffer, which was allocated with new. + // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the + // underlying ColumnBuffer, the ColumnBuffer will be deleted. + delete arrow_buffer; + } if (array->buffers != nullptr) { - delete[] array->buffers; + free(array->buffers); + array->buffers = nullptr; } - struct ArrowArray* dict = array->dictionary; - if (dict != nullptr) { - if (dict->buffers != nullptr) { - free(dict->buffers); - dict->buffers = nullptr; - } - if (dict->release != nullptr) { - delete dict; - dict = nullptr; + if (array->children != nullptr) { + for (auto i = 0; i < array->n_children; i++) { + if (array->children[i] != nullptr) { + if (array->children[i]->release != nullptr) { + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema array->child {} release", + i)); + release_array(array->children[i]); + } + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema array->child {} free", i)); + free(array->children[i]); + } } + LOG_TRACE("[ArrowAdapter] release_array array->children"); + free(array->children); + array->children = nullptr; + } + + if (array->dictionary != nullptr) { + // TODO: This can lead to segfault on some data sets, could be caused + // by how we fill arrow data structures. This should pass. + // if (array->dictionary->release != nullptr) { + // LOG_TRACE("[ArrowAdapter] release_array array->dict release"); + // release_array(array->dictionary); + //} + LOG_TRACE("[ArrowAdapter] release_array array->dict free"); + free(array->dictionary); + array->dictionary = nullptr; } array->release = nullptr; + LOG_TRACE(fmt::format("[ArrowAdapter] release_array done")); } std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( @@ -108,17 +154,20 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( auto nattr = tiledb_schema.attribute_num(); std::unique_ptr arrow_schema = std::make_unique(); - arrow_schema->format = "+s"; + arrow_schema->format = strdup("+s"); arrow_schema->n_children = ndim + nattr; arrow_schema->release = &ArrowAdapter::release_schema; - arrow_schema->children = new ArrowSchema*[arrow_schema->n_children]; + arrow_schema->children = (ArrowSchema**)malloc( + arrow_schema->n_children * sizeof(ArrowSchema*)); ArrowSchema* child = nullptr; for (uint32_t i = 0; i < ndim; ++i) { auto dim = tiledb_schema.domain().dimension(i); - child = arrow_schema->children[i] = new ArrowSchema; - child->format = ArrowAdapter::to_arrow_format(dim.type()).data(); + child = arrow_schema->children[i] = (ArrowSchema*)malloc( + sizeof(ArrowSchema)); + child->format = strdup( + ArrowAdapter::to_arrow_format(dim.type()).data()); child->name = strdup(dim.name().c_str()); child->metadata = nullptr; child->flags = 0; @@ -130,8 +179,10 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( for (uint32_t i = 0; i < nattr; ++i) { auto attr = tiledb_schema.attribute(i); - child = arrow_schema->children[ndim + i] = new ArrowSchema; - child->format = ArrowAdapter::to_arrow_format(attr.type()).data(); + child = arrow_schema->children[ndim + i] = (ArrowSchema*)malloc( + sizeof(ArrowSchema)); + child->format = strdup( + ArrowAdapter::to_arrow_format(attr.type()).data()); child->name = strdup(attr.name().c_str()); child->metadata = nullptr; child->flags = attr.nullable() ? ARROW_FLAG_NULLABLE : 0; @@ -144,7 +195,9 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( if (enmr_name.has_value()) { auto enmr = ArrayExperimental::get_enumeration( *ctx, *tiledb_array, attr.name()); - auto dict = new ArrowSchema; + auto dict = (ArrowSchema*)malloc(sizeof(ArrowSchema)); + dict->format = strdup( + ArrowAdapter::to_arrow_format(enmr.type(), false).data()); if (enmr.type() == TILEDB_STRING_ASCII or enmr.type() == TILEDB_CHAR) { dict->format = strdup("z"); @@ -184,7 +237,7 @@ std::pair ArrowAdapter::_get_data_and_length( // Allocate a single byte to copy the bits into size_t sz = 1; - dst = new const void*[sz]; + dst = malloc(sz); std::memcpy((void*)dst, &src, sz); return std::pair(dst, data.size()); @@ -253,43 +306,57 @@ bool ArrowAdapter::_isstr(const char* format) { return false; } +inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { + if (ec != NANOARROW_OK) + throw TileDBSOMAError( + fmt::format("ArrowAdapter: Arrow Error {} ", msg)); +} + std::pair, std::unique_ptr> ArrowAdapter::to_arrow(std::shared_ptr column) { std::unique_ptr schema = std::make_unique(); std::unique_ptr array = std::make_unique(); - - schema->format = to_arrow_format(column->type()).data(); - schema->name = column->name().data(); - schema->metadata = nullptr; - schema->flags = 0; - schema->n_children = 0; - schema->children = nullptr; - schema->dictionary = nullptr; + auto sch = schema.get(); + auto arr = array.get(); + + auto coltype = to_arrow_format(column->type()).data(); + auto natype = to_nanoarrow_type(coltype); + exitIfError(ArrowSchemaInitFromType(sch, natype), "Bad schema init"); + exitIfError( + ArrowSchemaSetName(sch, column->name().data()), "Bad schema name"); + exitIfError( + ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); + // After allocating and initializing via nanoarrow we + // hook our custom release function in schema->release = &release_schema; - schema->private_data = nullptr; + // this will be 3 for char vecs and 2 for enumerations int n_buffers = column->is_var() ? 3 : 2; // Create an ArrowBuffer to manage the lifetime of `column`. - // - `arrow_buffer` holds a shared_ptr to `column`, which - // increments + // - `arrow_buffer` holds shared_ptr to `column`, increments // the use count and keeps the ColumnBuffer data alive. // - When the arrow array is released, `array->release()` is - // called with - // `arrow_buffer` in `private_data`. `arrow_buffer` is - // deleted, which decrements the the `column` use count. When - // the `column` use count reaches 0, the ColumnBuffer data - // will be deleted. + // called with `arrow_buffer` in `private_data`. + // `arrow_buffer` is deleted, which decrements the the + // `column` use count. When the `column` use count reaches + // 0, the ColumnBuffer data will be deleted. auto arrow_buffer = new ArrowBuffer(column); + exitIfError(ArrowArrayInitFromType(arr, natype), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, 0), "Bad array children alloc"); array->length = column->size(); - array->null_count = 0; - array->offset = 0; - array->n_buffers = n_buffers; - array->n_children = 0; - array->buffers = nullptr; - array->children = nullptr; - array->dictionary = nullptr; + + LOG_TRACE(fmt::format( + "[ArrowAdapter] column type {} name {} nbuf {} {} nullable {}", + to_arrow_format(column->type()).data(), + column->name().data(), + n_buffers, + array->n_buffers, + column->is_nullable())); + + // After allocating and initializing via nanoarrow we + // hook our custom release function in array->release = &release_array; array->private_data = (void*)arrow_buffer; @@ -298,16 +365,16 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { column->name(), column.use_count())); - array->buffers = new const void*[n_buffers]; + array->buffers = (const void**)malloc(sizeof(void*) * n_buffers); assert(array->buffers != nullptr); - array->buffers[0] = nullptr; // validity + array->buffers[0] = nullptr; // validity addressed below array->buffers[n_buffers - 1] = column->data().data(); // data if (n_buffers == 3) { array->buffers[1] = column->offsets().data(); // offsets } if (column->is_nullable()) { - schema->flags |= ARROW_FLAG_NULLABLE; + schema->flags |= ARROW_FLAG_NULLABLE; // it is also set by default // Count nulls for (auto v : column->validity()) { @@ -317,7 +384,10 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { // Convert validity bytemap to a bitmap in place column->validity_to_bitmap(); array->buffers[0] = column->validity().data(); + } else { + schema->flags = 0; // as ArrowSchemaInitFromType leads to NULLABLE set } + if (column->is_ordered()) { schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; } @@ -327,34 +397,59 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { column->data_to_bitmap(); } + // Workaround for datetime + if (column->type() == TILEDB_DATETIME_SEC || + column->type() == TILEDB_DATETIME_MS || + column->type() == TILEDB_DATETIME_NS) { + free((void*)schema->format); // free the 'storage' format + schema->format = strdup(to_arrow_format(column->type()).data()); + } + + // Workaround for date + if (column->type() == TILEDB_DATETIME_DAY) { + free((void*)schema->format); // free the 'storage' format + schema->format = strdup(to_arrow_format(column->type()).data()); + // TODO: Put in ColumnBuffer + size_t n = array->length; + std::vector indata(n); + std::memcpy( + indata.data(), column->data().data(), sizeof(int64_t) * n); + std::vector vec(n); + for (size_t i = 0; i < n; i++) { + vec[i] = static_cast(indata[i]); + } + std::memcpy( + (void*)array->buffers[n_buffers - 1], + vec.data(), + sizeof(int32_t) * n); + } + if (column->has_enumeration()) { - auto dict_sch = new ArrowSchema; - auto dict_arr = new ArrowArray; + auto dict_sch = (ArrowSchema*)malloc(sizeof(ArrowSchema)); + auto dict_arr = (ArrowArray*)malloc(sizeof(ArrowArray)); auto enmr = column->get_enumeration_info(); - dict_sch->format = strdup(to_arrow_format(enmr->type(), false).data()); - dict_sch->name = nullptr; - dict_sch->metadata = nullptr; - dict_sch->flags = 0; - dict_sch->n_children = 0; - dict_sch->children = nullptr; - dict_sch->dictionary = nullptr; + auto dcoltype = to_arrow_format(enmr->type(), false).data(); + auto dnatype = to_nanoarrow_type(dcoltype); + + exitIfError( + ArrowSchemaInitFromType(dict_sch, dnatype), "Bad schema init"); + exitIfError(ArrowSchemaSetName(dict_sch, ""), "Bad schema name"); + exitIfError( + ArrowSchemaAllocateChildren(dict_sch, 0), + "Bad schema children alloc"); dict_sch->release = &release_schema; - dict_sch->private_data = nullptr; - - const int n_buf = ArrowAdapter::_isstr(dict_sch->format) ? 3 : 2; - dict_arr->null_count = 0; - dict_arr->offset = 0; - dict_arr->n_buffers = n_buf; - dict_arr->n_children = 0; - dict_arr->buffers = nullptr; - dict_arr->children = nullptr; - dict_arr->dictionary = nullptr; - dict_arr->release = &release_array; - dict_arr->private_data = nullptr; - dict_arr->buffers = new const void*[n_buf]; + exitIfError( + ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); + exitIfError( + ArrowArrayAllocateChildren(dict_arr, 0), + "Bad array children alloc"); + const int n_buf = ArrowAdapter::_isstr(dict_sch->format) == true ? 3 : + 2; + dict_arr->buffers = (const void**)malloc(sizeof(void*) * n_buf); dict_arr->buffers[0] = nullptr; // validity: none here + dict_arr->release = &release_array; // TODO string types currently get the data and offset // buffers from ColumnBuffer::enum_offsets and @@ -427,6 +522,8 @@ std::string_view ArrowAdapter::to_arrow_format( return "ttu"; case TILEDB_TIME_NS: return "ttn"; + case TILEDB_DATETIME_DAY: + return "tdD"; case TILEDB_DATETIME_SEC: return "tss:"; case TILEDB_DATETIME_MS: @@ -443,4 +540,51 @@ std::string_view ArrowAdapter::to_arrow_format( tiledb::impl::type_to_str(datatype))); } +// FIXME: Add more types, maybe make it a map +enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { + if (sv == "i") + return NANOARROW_TYPE_INT32; + else if (sv == "c") + return NANOARROW_TYPE_INT8; + else if (sv == "C") + return NANOARROW_TYPE_UINT8; + else if (sv == "s") + return NANOARROW_TYPE_INT16; + else if (sv == "S") + return NANOARROW_TYPE_UINT16; + else if (sv == "I") + return NANOARROW_TYPE_UINT32; + else if (sv == "l") + return NANOARROW_TYPE_INT64; + else if (sv == "L") + return NANOARROW_TYPE_UINT64; + else if (sv == "f") + return NANOARROW_TYPE_FLOAT; + else if (sv == "g") + return NANOARROW_TYPE_DOUBLE; + else if (sv == "u") + return NANOARROW_TYPE_STRING; + else if (sv == "U") + return NANOARROW_TYPE_LARGE_STRING; + else if (sv == "b") + return NANOARROW_TYPE_BOOL; + else if (sv == "tss:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsm:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsn:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsu:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tdD") + return NANOARROW_TYPE_INT32; // R Date: fractional days since epoch + else if (sv == "z") + return NANOARROW_TYPE_BINARY; + else if (sv == "Z") + return NANOARROW_TYPE_LARGE_BINARY; + else + throw TileDBSOMAError( + fmt::format("ArrowAdapter: Unsupported Arrow format: {} ", sv)); +} + } // namespace tiledbsoma diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 7ef09173d1..6417be94c8 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -8,9 +8,8 @@ // https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout // https://arrow.apache.org/docs/format/CDataInterface.html#exporting-a-simple-int32-array -#ifndef ARROW_SCHEMA_AND_ARRAY_DEFINED -#include "carrow.h" -#endif +#include "nanoarrow.hpp" + namespace tiledbsoma { using namespace tiledb; @@ -60,6 +59,8 @@ class ArrowAdapter { static std::string_view to_arrow_format( tiledb_datatype_t datatype, bool use_large = true); + static enum ArrowType to_nanoarrow_type(std::string_view sv); + private: static std::pair _get_data_and_length( Enumeration& enmr, const void* dst); diff --git a/libtiledbsoma/src/utils/carrow.h b/libtiledbsoma/src/utils/carrow.h deleted file mode 100644 index f97f3a7a08..0000000000 --- a/libtiledbsoma/src/utils/carrow.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef TILEDBSOMA_CARROW_H -#define TILEDBSOMA_CARROW_H -/* ************************************************************************ */ -/* - * Arrow C Data Interface - * Apache License 2.0 - * source: https://arrow.apache.org/docs/format/CDataInterface.html - */ - -#include - -#define ARROW_FLAG_DICTIONARY_ORDERED 1 -#define ARROW_FLAG_NULLABLE 2 -#define ARROW_FLAG_MAP_KEYS_SORTED 4 - -struct ArrowSchema { - // Array type description - const char* format; - const char* name; - const char* metadata; - int64_t flags; - int64_t n_children; - struct ArrowSchema** children; - struct ArrowSchema* dictionary; - - // Release callback - void (*release)(struct ArrowSchema*); - // Opaque producer-specific data - void* private_data; -}; - -struct ArrowArray { - // Array data description - int64_t length; - int64_t null_count; - int64_t offset; - int64_t n_buffers; - int64_t n_children; - const void** buffers; - struct ArrowArray** children; - struct ArrowArray* dictionary; - - // Release callback - void (*release)(struct ArrowArray*); - // Opaque producer-specific data - void* private_data; -}; -/* End Arrow C API */ -/* ************************************************************************ */ -#endif // TILEDBSOMA_CARROW_H diff --git a/apis/r/src/nanoarrow.c b/libtiledbsoma/src/utils/nanoarrow.c similarity index 74% rename from apis/r/src/nanoarrow.c rename to libtiledbsoma/src/utils/nanoarrow.c index 1d31884b19..c946c01362 100644 --- a/apis/r/src/nanoarrow.c +++ b/libtiledbsoma/src/utils/nanoarrow.c @@ -28,7 +28,7 @@ const char* ArrowNanoarrowVersion(void) { return NANOARROW_VERSION; } int ArrowNanoarrowVersionInt(void) { return NANOARROW_VERSION_INT; } -int ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { +ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { if (error == NULL) { return NANOARROW_OK; } @@ -49,12 +49,13 @@ int ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { } } -const char* ArrowErrorMessage(struct ArrowError* error) { return error->message; } - void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_VALIDITY; - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = NANOARROW_TYPE_BOOL; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = storage_type; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[2] = NANOARROW_TYPE_UNINITIALIZED; layout->element_size_bits[0] = 1; layout->element_size_bits[1] = 0; @@ -66,43 +67,53 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; layout->element_size_bits[0] = 0; break; case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_MAP: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_LARGE_LIST: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; layout->element_size_bits[1] = 64; break; + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + case NANOARROW_TYPE_BOOL: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 1; break; case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 8; break; case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_HALF_FLOAT: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 16; break; case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_FLOAT: + layout->element_size_bits[1] = 32; + break; case NANOARROW_TYPE_INTERVAL_MONTHS: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; @@ -110,49 +121,61 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_INTERVAL_DAY_TIME: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 64; break; case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 128; break; case NANOARROW_TYPE_DECIMAL256: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 256; break; case NANOARROW_TYPE_FIXED_SIZE_BINARY: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY; break; case NANOARROW_TYPE_DENSE_UNION: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; layout->element_size_bits[0] = 8; layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_UNION_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_SPARSE_UNION: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; break; case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = storage_type; break; case NANOARROW_TYPE_LARGE_STRING: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_STRING; + break; case NANOARROW_TYPE_LARGE_BINARY: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; layout->element_size_bits[1] = 64; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; break; default: @@ -169,11 +192,15 @@ void ArrowFree(void* ptr) { free(ptr); } static uint8_t* ArrowBufferAllocatorMallocReallocate( struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, int64_t new_size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(old_size); return (uint8_t*)ArrowRealloc(ptr, new_size); } static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(size); ArrowFree(ptr); } @@ -187,6 +214,10 @@ struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { static uint8_t* ArrowBufferAllocatorNeverReallocate( struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, int64_t new_size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(ptr); + NANOARROW_UNUSED(old_size); + NANOARROW_UNUSED(new_size); return NULL; } @@ -200,6 +231,205 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( allocator.private_data = private_data; return allocator; } + +static const int kInt32DecimalDigits = 9; + +static const uint64_t kUInt32PowersOfTen[] = { + 1ULL, 10ULL, 100ULL, 1000ULL, 10000ULL, + 100000ULL, 1000000ULL, 10000000ULL, 100000000ULL, 1000000000ULL}; + +// Adapted from Arrow C++ to use 32-bit words for better C portability +// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L524-L544 +static void ShiftAndAdd(struct ArrowStringView value, uint32_t* out, int64_t out_size) { + // We use strtoll for parsing, which needs input that is null-terminated + char chunk_string[16]; + + for (int64_t posn = 0; posn < value.size_bytes;) { + int64_t remaining = value.size_bytes - posn; + + int64_t group_size; + if (remaining > kInt32DecimalDigits) { + group_size = kInt32DecimalDigits; + } else { + group_size = remaining; + } + + const uint64_t multiple = kUInt32PowersOfTen[group_size]; + + memcpy(chunk_string, value.data + posn, group_size); + chunk_string[group_size] = '\0'; + uint32_t chunk = (uint32_t)strtoll(chunk_string, NULL, 10); + + for (int64_t i = 0; i < out_size; i++) { + uint64_t tmp = out[i]; + tmp *= multiple; + tmp += chunk; + out[i] = (uint32_t)(tmp & 0xFFFFFFFFULL); + chunk = (uint32_t)(tmp >> 32); + } + posn += group_size; + } +} + +ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, + struct ArrowStringView value) { + // Check for sign + int is_negative = value.data[0] == '-'; + int has_sign = is_negative || value.data[0] == '+'; + value.data += has_sign; + value.size_bytes -= has_sign; + + // Check all characters are digits that are not the negative sign + for (int64_t i = 0; i < value.size_bytes; i++) { + char c = value.data[i]; + if (c < '0' || c > '9') { + return EINVAL; + } + } + + // Skip over leading 0s + int64_t n_leading_zeroes = 0; + for (int64_t i = 0; i < value.size_bytes; i++) { + if (value.data[i] == '0') { + n_leading_zeroes++; + } else { + break; + } + } + + value.data += n_leading_zeroes; + value.size_bytes -= n_leading_zeroes; + + // Use 32-bit words for portability + uint32_t words32[8]; + int n_words32 = decimal->n_words * 2; + NANOARROW_DCHECK(n_words32 <= 8); + memset(words32, 0, sizeof(words32)); + + ShiftAndAdd(value, words32, n_words32); + + if (decimal->low_word_index == 0) { + memcpy(decimal->words, words32, sizeof(uint32_t) * n_words32); + } else { + uint64_t lo; + uint64_t hi; + + for (int i = 0; i < decimal->n_words; i++) { + lo = (uint64_t)words32[i * 2]; + hi = (uint64_t)words32[i * 2 + 1] << 32; + decimal->words[decimal->n_words - i - 1] = lo | hi; + } + } + + if (is_negative) { + ArrowDecimalNegate(decimal); + } + + return NANOARROW_OK; +} + +// Adapted from Arrow C++ for C +// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L365 +ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, + struct ArrowBuffer* buffer) { + int is_negative = ArrowDecimalSign(decimal) < 0; + + uint64_t words_little_endian[4]; + if (decimal->low_word_index == 0) { + memcpy(words_little_endian, decimal->words, decimal->n_words * sizeof(uint64_t)); + } else { + for (int i = 0; i < decimal->n_words; i++) { + words_little_endian[i] = decimal->words[decimal->n_words - i - 1]; + } + } + + // We've already made a copy, so negate that if needed + if (is_negative) { + uint64_t carry = 1; + for (int i = 0; i < decimal->n_words; i++) { + uint64_t elem = words_little_endian[i]; + elem = ~elem + carry; + carry &= (elem == 0); + words_little_endian[i] = elem; + } + } + + // Find the most significant word that is non-zero + int most_significant_elem_idx = -1; + for (int i = decimal->n_words - 1; i >= 0; i--) { + if (words_little_endian[i] != 0) { + most_significant_elem_idx = i; + break; + } + } + + // If they are all zero, the output is just '0' + if (most_significant_elem_idx == -1) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(buffer, '0')); + return NANOARROW_OK; + } + + // Define segments such that each segment represents 9 digits with the + // least significant group of 9 digits first. For example, if the input represents + // 9876543210123456789, then segments will be [123456789, 876543210, 9]. + // We handle at most a signed 256 bit integer, whose maximum value occupies 77 + // characters. Thus, we need at most 9 segments. + const uint32_t k1e9 = 1000000000U; + int num_segments = 0; + uint32_t segments[9]; + memset(segments, 0, sizeof(segments)); + uint64_t* most_significant_elem = words_little_endian + most_significant_elem_idx; + + do { + // Compute remainder = words_little_endian % 1e9 and words_little_endian = + // words_little_endian / 1e9. + uint32_t remainder = 0; + uint64_t* elem = most_significant_elem; + + do { + // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer); + // *elem = dividend / 1e9; + // remainder = dividend % 1e9. + uint32_t hi = (uint32_t)(*elem >> 32); + uint32_t lo = (uint32_t)(*elem & 0xFFFFFFFFULL); + uint64_t dividend_hi = ((uint64_t)(remainder) << 32) | hi; + uint64_t quotient_hi = dividend_hi / k1e9; + remainder = (uint32_t)(dividend_hi % k1e9); + uint64_t dividend_lo = ((uint64_t)(remainder) << 32) | lo; + uint64_t quotient_lo = dividend_lo / k1e9; + remainder = (uint32_t)(dividend_lo % k1e9); + + *elem = (quotient_hi << 32) | quotient_lo; + } while (elem-- != words_little_endian); + + segments[num_segments++] = remainder; + } while (*most_significant_elem != 0 || most_significant_elem-- != words_little_endian); + + // We know our output has no more than 9 digits per segment, plus a negative sign, + // plus any further digits between our output of 9 digits plus enough + // extra characters to ensure that snprintf() with n = 21 (maximum length of %lu + // including a the null terminator) is bounded properly. + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, num_segments * 9 + 1 + 21 - 9)); + if (is_negative) { + buffer->data[buffer->size_bytes++] = '-'; + } + + // The most significant segment should have no leading zeroes + int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu", + (unsigned long)segments[num_segments - 1]); + buffer->size_bytes += n_chars; + + // Subsequent output needs to be left-padded with zeroes such that each segment + // takes up exactly 9 digits. + for (int i = num_segments - 2; i >= 0; i--) { + int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%09lu", + (unsigned long)segments[i]); + buffer->size_bytes += n_chars; + NANOARROW_DCHECK(buffer->size_bytes <= buffer->capacity_bytes); + } + + return NANOARROW_OK; +} // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -224,7 +454,8 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( #include "nanoarrow.h" -static void ArrowSchemaRelease(struct ArrowSchema* schema) { +// -- changed for tiledb-r static +void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { if (schema->format != NULL) ArrowFree((void*)schema->format); if (schema->name != NULL) ArrowFree((void*)schema->name); if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); @@ -236,7 +467,7 @@ static void ArrowSchemaRelease(struct ArrowSchema* schema) { for (int64_t i = 0; i < schema->n_children; i++) { if (schema->children[i] != NULL) { if (schema->children[i]->release != NULL) { - schema->children[i]->release(schema->children[i]); + ArrowSchemaRelease(schema->children[i]); } ArrowFree(schema->children[i]); @@ -251,7 +482,7 @@ static void ArrowSchemaRelease(struct ArrowSchema* schema) { // release() callback. if (schema->dictionary != NULL) { if (schema->dictionary->release != NULL) { - schema->dictionary->release(schema->dictionary); + ArrowSchemaRelease(schema->dictionary); } ArrowFree(schema->dictionary); @@ -373,7 +604,7 @@ void ArrowSchemaInit(struct ArrowSchema* schema) { schema->children = NULL; schema->dictionary = NULL; schema->private_data = NULL; - schema->release = &ArrowSchemaRelease; + schema->release = &ArrowSchemaReleaseInternal; } ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) { @@ -409,7 +640,7 @@ ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowTyp int result = ArrowSchemaSetType(schema, type); if (result != NANOARROW_OK) { - schema->release(schema); + ArrowSchemaRelease(schema); return result; } @@ -498,10 +729,33 @@ ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum Arrow int n_chars; switch (type) { case NANOARROW_TYPE_TIME32: + if (timezone != NULL) { + return EINVAL; + } + + switch (time_unit) { + case NANOARROW_TIME_UNIT_MICRO: + case NANOARROW_TIME_UNIT_NANO: + return EINVAL; + default: + break; + } + + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); + break; case NANOARROW_TYPE_TIME64: if (timezone != NULL) { return EINVAL; } + + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + case NANOARROW_TIME_UNIT_MILLI: + return EINVAL; + default: + break; + } + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); break; case NANOARROW_TYPE_TIMESTAMP: @@ -685,13 +939,13 @@ ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) { return NANOARROW_OK; } -ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, +ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, struct ArrowSchema* schema_out) { ArrowSchemaInit(schema_out); int result = ArrowSchemaSetFormat(schema_out, schema->format); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } @@ -699,26 +953,26 @@ ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, result = ArrowSchemaSetName(schema_out, schema->name); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaSetMetadata(schema_out, schema->metadata); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaAllocateChildren(schema_out, schema->n_children); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } for (int64_t i = 0; i < schema->n_children; i++) { result = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } } @@ -726,13 +980,13 @@ ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, if (schema->dictionary != NULL) { result = ArrowSchemaAllocateDictionary(schema_out); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } } @@ -814,8 +1068,7 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, // decimal case 'd': if (format[1] != ':' || format[2] == '\0') { - ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'", - format + 3); + ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'"); return EINVAL; } @@ -1160,13 +1413,15 @@ static ArrowErrorCode ArrowSchemaViewValidateNChildren( for (int64_t i = 0; i < schema_view->schema->n_children; i++) { child = schema_view->schema->children[i]; if (child == NULL) { - ArrowErrorSet(error, "Expected valid schema at schema->children[%d] but found NULL", - i); + ArrowErrorSet(error, + "Expected valid schema at schema->children[%ld] but found NULL", + (long)i); return EINVAL; } else if (child->release == NULL) { ArrowErrorSet( error, - "Expected valid schema at schema->children[%d] but found a released schema", i); + "Expected valid schema at schema->children[%ld] but found a released schema", + (long)i); return EINVAL; } } @@ -1305,7 +1560,8 @@ static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_vie } ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, - struct ArrowSchema* schema, struct ArrowError* error) { + const struct ArrowSchema* schema, + struct ArrowError* error) { if (schema == NULL) { ArrowErrorSet(error, "Expected non-NULL schema"); return EINVAL; @@ -1333,8 +1589,7 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, } const char* format_end_out; - ArrowErrorCode result = - ArrowSchemaViewParse(schema_view, format, &format_end_out, error); + int result = ArrowSchemaViewParse(schema_view, format, &format_end_out, error); if (result != NANOARROW_OK) { if (error != NULL) { @@ -1356,16 +1611,31 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, schema_view->type = NANOARROW_TYPE_DICTIONARY; } - result = ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error); - if (result != NANOARROW_OK) { - return result; - } + NANOARROW_RETURN_NOT_OK( + ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error)); if (schema_view->storage_type != schema_view->type) { - result = ArrowSchemaViewValidate(schema_view, schema_view->type, error); - if (result != NANOARROW_OK) { - return result; - } + NANOARROW_RETURN_NOT_OK( + ArrowSchemaViewValidate(schema_view, schema_view->type, error)); + } + + int64_t unknown_flags = schema->flags & ~NANOARROW_FLAG_ALL_SUPPORTED; + if (unknown_flags != 0) { + ArrowErrorSet(error, "Unknown ArrowSchema flag"); + return EINVAL; + } + + if (schema->flags & ARROW_FLAG_DICTIONARY_ORDERED && + schema_view->type != NANOARROW_TYPE_DICTIONARY) { + ArrowErrorSet(error, + "ARROW_FLAG_DICTIONARY_ORDERED is only relevant for dictionaries"); + return EINVAL; + } + + if (schema->flags & ARROW_FLAG_MAP_KEYS_SORTED && + schema_view->type != NANOARROW_TYPE_MAP) { + ArrowErrorSet(error, "ARROW_FLAG_MAP_KEYS_SORTED is only relevant for a map type"); + return EINVAL; } ArrowLayoutInit(&schema_view->layout, schema_view->storage_type); @@ -1377,10 +1647,12 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, schema_view->extension_name = ArrowCharView(NULL); schema_view->extension_metadata = ArrowCharView(NULL); - ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:name"), - &schema_view->extension_name); - ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:metadata"), - &schema_view->extension_metadata); + NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, + ArrowCharView("ARROW:extension:name"), + &schema_view->extension_name)); + NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, + ArrowCharView("ARROW:extension:metadata"), + &schema_view->extension_metadata)); return NANOARROW_OK; } @@ -1413,7 +1685,7 @@ static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_vi } } -// Helper for bookeeping to emulate sprintf()-like behaviour spread +// Helper for bookkeeping to emulate sprintf()-like behaviour spread // among multiple sprintf calls. static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, int64_t* n_remaining, int64_t* n_chars) { @@ -1431,7 +1703,7 @@ static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, } } -int64_t ArrowSchemaToString(struct ArrowSchema* schema, char* out, int64_t n, +int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, char recursive) { if (schema == NULL) { return snprintf(out, n, "[invalid: pointer is null]"); @@ -1568,7 +1840,9 @@ int64_t ArrowMetadataSizeOf(const char* metadata) { struct ArrowMetadataReader reader; struct ArrowStringView key; struct ArrowStringView value; - ArrowMetadataReaderInit(&reader, metadata); + if (ArrowMetadataReaderInit(&reader, metadata) != NANOARROW_OK) { + return 0; + } int64_t size = sizeof(int32_t); while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) { @@ -1584,7 +1858,7 @@ static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata, struct ArrowMetadataReader reader; struct ArrowStringView existing_key; struct ArrowStringView existing_value; - ArrowMetadataReaderInit(&reader, metadata); + NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) == NANOARROW_OK) { @@ -1611,7 +1885,10 @@ ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringVie char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) { struct ArrowStringView value = ArrowCharView(NULL); - ArrowMetadataGetValue(metadata, key, &value); + if (ArrowMetadataGetValue(metadata, key, &value) != NANOARROW_OK) { + return 0; + } + return value.data != NULL; } @@ -1749,7 +2026,7 @@ ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, #include "nanoarrow.h" // -- changed for tiledb-r static -void ArrowArrayRelease(struct ArrowArray* array) { +void ArrowArrayReleaseInternal(struct ArrowArray* array) { // Release buffers held by this array struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; @@ -1767,7 +2044,7 @@ void ArrowArrayRelease(struct ArrowArray* array) { for (int64_t i = 0; i < array->n_children; i++) { if (array->children[i] != NULL) { if (array->children[i]->release != NULL) { - array->children[i]->release(array->children[i]); + ArrowArrayRelease(array->children[i]); } ArrowFree(array->children[i]); @@ -1782,7 +2059,7 @@ void ArrowArrayRelease(struct ArrowArray* array) { // release() callback. if (array->dictionary != NULL) { if (array->dictionary->release != NULL) { - array->dictionary->release(array->dictionary); + ArrowArrayRelease(array->dictionary); } ArrowFree(array->dictionary); @@ -1794,7 +2071,7 @@ void ArrowArrayRelease(struct ArrowArray* array) { // -- changed for tiledb-r static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, - enum ArrowType storage_type) { + enum ArrowType storage_type) { switch (storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: @@ -1861,7 +2138,7 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, array->buffers = NULL; array->children = NULL; array->dictionary = NULL; - array->release = &ArrowArrayRelease; + array->release = &ArrowArrayReleaseInternal; array->private_data = NULL; struct ArrowArrayPrivateData* private_data = @@ -1883,7 +2160,7 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, int result = ArrowArraySetStorageType(array, storage_type); if (result != NANOARROW_OK) { - array->release(array); + ArrowArrayRelease(array); return result; } @@ -1894,26 +2171,45 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, return NANOARROW_OK; } -static ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, - struct ArrowArrayView* array_view, - struct ArrowError* error) { - ArrowArrayInitFromType(array, array_view->storage_type); +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + const struct ArrowArrayView* array_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayInitFromType(array, array_view->storage_type), error); + int result; + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; + private_data->layout = array_view->layout; - int result = ArrowArrayAllocateChildren(array, array_view->n_children); - if (result != NANOARROW_OK) { - array->release(array); - return result; + if (array_view->n_children > 0) { + result = ArrowArrayAllocateChildren(array, array_view->n_children); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + + for (int64_t i = 0; i < array_view->n_children; i++) { + result = + ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + } } - private_data->layout = array_view->layout; + if (array_view->dictionary != NULL) { + result = ArrowArrayAllocateDictionary(array); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } - for (int64_t i = 0; i < array_view->n_children; i++) { - int result = - ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + result = + ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); if (result != NANOARROW_OK) { - array->release(array); + ArrowArrayRelease(array); return result; } } @@ -1922,7 +2218,7 @@ static ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, } ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, - struct ArrowSchema* schema, + const struct ArrowSchema* schema, struct ArrowError* error) { struct ArrowArrayView array_view; NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error)); @@ -1957,9 +2253,7 @@ ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_ch return ENOMEM; } - for (int64_t i = 0; i < n_children; i++) { - array->children[i] = NULL; - } + memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); for (int64_t i = 0; i < n_children; i++) { array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); @@ -2027,6 +2321,16 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_v ArrowArrayViewInitFromType(array_view, private_data->storage_type); array_view->layout = private_data->layout; array_view->array = array; + array_view->length = array->length; + array_view->offset = array->offset; + array_view->null_count = array->null_count; + + array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; + array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; + array_view->buffer_views[1].data.as_uint8 = private_data->buffers[0].data; + array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; + array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; + array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); if (result != NANOARROW_OK) { @@ -2042,6 +2346,20 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_v } } + if (array->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + return NANOARROW_OK; } @@ -2103,7 +2421,7 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { case NANOARROW_TYPE_LARGE_BINARY: case NANOARROW_TYPE_LARGE_STRING: if (ArrowArrayBuffer(array, 2)->data == NULL) { - ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0)); } break; default: @@ -2114,6 +2432,10 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i])); } + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); + } + return NANOARROW_OK; } @@ -2121,46 +2443,17 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; - for (int64_t i = 0; i < 3; i++) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; } for (int64_t i = 0; i < array->n_children; i++) { ArrowArrayFlushInternalPointers(array->children[i]); } -} - -static ArrowErrorCode ArrowArrayCheckInternalBufferSizes( - struct ArrowArray* array, struct ArrowArrayView* array_view, char set_length, - struct ArrowError* error) { - if (set_length) { - ArrowArrayViewSetLength(array_view, array->offset + array->length); - } - for (int64_t i = 0; i < array->n_buffers; i++) { - if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && - array->null_count == 0 && array->buffers[i] == NULL) { - continue; - } - - int64_t expected_size = array_view->buffer_views[i].size_bytes; - int64_t actual_size = ArrowArrayBuffer(array, i)->size_bytes; - - if (actual_size < expected_size) { - ArrowErrorSet( - error, - "Expected buffer %d to size >= %ld bytes but found buffer with %ld bytes", - (int)i, (long)expected_size, (long)actual_size); - return EINVAL; - } - } - - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayCheckInternalBufferSizes( - array->children[i], array_view->children[i], set_length, error)); + if (array->dictionary != NULL) { + ArrowArrayFlushInternalPointers(array->dictionary); } - - return NANOARROW_OK; } ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, @@ -2170,7 +2463,7 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, // in some implementations (at least one version of Arrow C++ at the time this // was added). Only do this fix if we can assume CPU data access. if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { - NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array)); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); } // Make sure the value we get with array->buffers[i] is set to the actual @@ -2181,44 +2474,11 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, return NANOARROW_OK; } - // Check buffer sizes to make sure we are not sending an ArrowArray - // into the wild that is going to segfault + // For validation, initialize an ArrowArrayView with our known buffer sizes struct ArrowArrayView array_view; - - NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array)); - - // Check buffer sizes once without using internal buffer data since - // ArrowArrayViewSetArray() assumes that all the buffers are long enough - // and issues invalid reads on offset buffers if they are not - int result = ArrowArrayCheckInternalBufferSizes(array, &array_view, 1, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - - if (validation_level == NANOARROW_VALIDATION_LEVEL_MINIMAL) { - ArrowArrayViewReset(&array_view); - return NANOARROW_OK; - } - - result = ArrowArrayViewSetArray(&array_view, array, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - - result = ArrowArrayCheckInternalBufferSizes(array, &array_view, 0, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - - if (validation_level == NANOARROW_VALIDATION_LEVEL_DEFAULT) { - ArrowArrayViewReset(&array_view); - return NANOARROW_OK; - } - - result = ArrowArrayViewValidateFull(&array_view, error); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array), + error); + int result = ArrowArrayViewValidate(&array_view, validation_level, error); ArrowArrayViewReset(&array_view); return result; } @@ -2265,8 +2525,23 @@ ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, return NANOARROW_OK; } +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) { + if (array_view->dictionary != NULL) { + return EINVAL; + } + + array_view->dictionary = + (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); + if (array_view->dictionary == NULL) { + return ENOMEM; + } + + ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED); + return NANOARROW_OK; +} + ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, - struct ArrowSchema* schema, + const struct ArrowSchema* schema, struct ArrowError* error) { struct ArrowSchemaView schema_view; int result = ArrowSchemaViewInit(&schema_view, schema, error); @@ -2279,6 +2554,7 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, result = ArrowArrayViewAllocateChildren(array_view, schema->n_children); if (result != NANOARROW_OK) { + ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed"); ArrowArrayViewReset(array_view); return result; } @@ -2292,6 +2568,21 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, } } + if (schema->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = + ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION || array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) { array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t)); @@ -2300,8 +2591,8 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, } memset(array_view->union_type_id_map, -1, 256); - int8_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, - array_view->union_type_id_map + 128); + int32_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, + array_view->union_type_id_map + 128); for (int8_t child_index = 0; child_index < n_type_ids; child_index++) { int8_t type_id = array_view->union_type_id_map[128 + child_index]; array_view->union_type_id_map[type_id] = child_index; @@ -2323,6 +2614,11 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) { ArrowFree(array_view->children); } + if (array_view->dictionary != NULL) { + ArrowArrayViewReset(array_view->dictionary); + ArrowFree(array_view->dictionary); + } + if (array_view->union_type_id_map != NULL) { ArrowFree(array_view->union_type_id_map); } @@ -2331,9 +2627,8 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) { } void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; - array_view->buffer_views[i].data.data = NULL; switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_VALIDITY: @@ -2377,57 +2672,217 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) } } -ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, - struct ArrowArray* array, - struct ArrowError* error) { +// This version recursively extracts information from the array and stores it +// in the array view, performing any checks that require the original array. +static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { array_view->array = array; - - // Check length and offset - if (array->offset < 0) { - ArrowErrorSet(error, "Expected array offset >= 0 but found array offset of %ld", - (long)array->offset); - return EINVAL; - } - - if (array->length < 0) { - ArrowErrorSet(error, "Expected array length >= 0 but found array length of %ld", - (long)array->length); - return EINVAL; - } - - // First pass setting lengths that do not depend on the data buffer - ArrowArrayViewSetLength(array_view, array->offset + array->length); + array_view->offset = array->offset; + array_view->length = array->length; + array_view->null_count = array->null_count; int64_t buffers_required = 0; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } buffers_required++; - // If the null_count is 0, the validity buffer can be NULL - if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && - array->null_count == 0 && array->buffers[i] == NULL) { + // Set buffer pointer + array_view->buffer_views[i].data.data = array->buffers[i]; + + // If non-null, set buffer size to unknown. + if (array->buffers[i] == NULL) { array_view->buffer_views[i].size_bytes = 0; + } else { + array_view->buffer_views[i].size_bytes = -1; } - - array_view->buffer_views[i].data.data = array->buffers[i]; } + // Check the number of buffers if (buffers_required != array->n_buffers) { ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)", (int)buffers_required, (int)array->n_buffers); return EINVAL; } + // Check number of children if (array_view->n_children != array->n_children) { ArrowErrorSet(error, "Expected %ld children but found %ld children", (long)array_view->n_children, (long)array->n_children); return EINVAL; } - // Check child sizes and calculate sizes that depend on data in the array buffers + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i], + array->children[i], error)); + } + + // Check dictionary + if (array->dictionary == NULL && array_view->dictionary != NULL) { + ArrowErrorSet(error, "Expected dictionary but found NULL"); + return EINVAL; + } + + if (array->dictionary != NULL && array_view->dictionary == NULL) { + ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member"); + return EINVAL; + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, + struct ArrowError* error) { + if (array_view->length < 0) { + ArrowErrorSet(error, "Expected length >= 0 but found length %ld", + (long)array_view->length); + return EINVAL; + } + + if (array_view->offset < 0) { + ArrowErrorSet(error, "Expected offset >= 0 but found offset %ld", + (long)array_view->offset); + return EINVAL; + } + + // Calculate buffer sizes that do not require buffer access. If marked as + // unknown, assign the buffer size; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + + // Only loop over the first two buffers because the size of the third buffer + // is always data dependent for all current Arrow types. + for (int i = 0; i < 2; i++) { + int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; + // Initialize with a value that will cause an error if accidentally used uninitialized + int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1; + + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: + if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) { + continue; + } + + min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length); + break; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Probably don't want/need to rely on the producer to have allocated an + // offsets buffer of length 1 for a zero-size array + min_buffer_size_bytes = + (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1); + break; + case NANOARROW_BUFFER_TYPE_DATA: + min_buffer_size_bytes = + _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * + offset_plus_length) / + 8; + break; + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + min_buffer_size_bytes = element_size_bytes * offset_plus_length; + break; + case NANOARROW_BUFFER_TYPE_NONE: + continue; + } + + // Assign or validate buffer size + if (array_view->buffer_views[i].size_bytes == -1) { + array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; + } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { + ArrowErrorSet(error, + "Expected %s array buffer %d to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (int)i, + (long)min_buffer_size_bytes, + (long)array_view->buffer_views[i].size_bytes); + return EINVAL; + } + } + + // For list, fixed-size list and map views, we can validate the number of children + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_MAP: + if (array_view->n_children != 1) { + ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays", + ArrowTypeString(array_view->storage_type), + (long)array_view->n_children); + return EINVAL; + } + default: + break; + } + + // For struct, the sparse union, and the fixed-size list views, we can validate child + // lengths. + int64_t child_min_length; + switch (array_view->storage_type) { + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_STRUCT: + child_min_length = (array_view->offset + array_view->length); + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i]->length < child_min_length) { + ArrowErrorSet( + error, + "Expected struct child %d to have length >= %ld but found child with " + "length %ld", + (int)(i + 1), (long)(child_min_length), + (long)array_view->children[i]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_min_length = (array_view->offset + array_view->length) * + array_view->layout.child_size_elements; + if (array_view->children[0]->length < child_min_length) { + ArrowErrorSet(error, + "Expected child of fixed_size_list array to have length >= %ld but " + "found array with length %ld", + (long)child_min_length, (long)array_view->children[0]->length); + return EINVAL; + } + break; + default: + break; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewValidateMinimal(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, + struct ArrowError* error) { + // Perform minimal validation. This will validate or assign + // buffer sizes as long as buffer access is not required. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + // Calculate buffer sizes or child lengths that require accessing the offsets + // buffer. Where appropriate, validate that the first offset is >= 0. + // If a buffer size is marked as unknown, assign it; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + int64_t first_offset; int64_t last_offset; switch (array_view->storage_type) { @@ -2441,11 +2896,22 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int32[array->offset + array->length]; - array_view->buffer_views[2].size_bytes = last_offset; + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } } break; + case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { @@ -2456,34 +2922,38 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int64[array->offset + array->length]; - array_view->buffer_views[2].size_bytes = last_offset; + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } } break; + case NANOARROW_TYPE_STRUCT: for (int64_t i = 0; i < array_view->n_children; i++) { - if (array->children[i]->length < (array->offset + array->length)) { + if (array_view->children[i]->length < offset_plus_length) { ArrowErrorSet( error, "Expected struct child %d to have length >= %ld but found child with " "length %ld", - (int)(i + 1), (long)(array->offset + array->length), - (long)array->children[i]->length); + (int)(i + 1), (long)offset_plus_length, + (long)array_view->children[i]->length); return EINVAL; } } break; - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_MAP: { - const char* type_name = - array_view->storage_type == NANOARROW_TYPE_LIST ? "list" : "map"; - if (array->n_children != 1) { - ArrowErrorSet(error, "Expected 1 child of %s array but found %d child arrays", - type_name, (int)array->n_children); - return EINVAL; - } + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int32[0]; if (first_offset < 0) { @@ -2492,27 +2962,20 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int32[array->offset + array->length]; - if (array->children[0]->length < last_offset) { + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, - "Expected child of %s array with length >= %ld but found array with " + "Expected child of %s array to have length >= %ld but found array with " "length %ld", - type_name, (long)last_offset, (long)array->children[0]->length); + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->children[0]->length); return EINVAL; } } break; - } - case NANOARROW_TYPE_LARGE_LIST: - if (array->n_children != 1) { - ArrowErrorSet(error, - "Expected 1 child of large list array but found %d child arrays", - (int)array->n_children); - return EINVAL; - } + case NANOARROW_TYPE_LARGE_LIST: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int64[0]; if (first_offset < 0) { @@ -2521,46 +2984,58 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int64[array->offset + array->length]; - if (array->children[0]->length < last_offset) { + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, - "Expected child of large list array with length >= %ld but found array " + "Expected child of large list array to have length >= %ld but found array " "with length %ld", - (long)last_offset, (long)array->children[0]->length); + (long)last_offset, (long)array_view->children[0]->length); return EINVAL; } } break; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - if (array->n_children != 1) { - ArrowErrorSet(error, - "Expected 1 child of fixed-size array but found %d child arrays", - (int)array->n_children); - return EINVAL; - } - - last_offset = - (array->offset + array->length) * array_view->layout.child_size_elements; - if (array->children[0]->length < last_offset) { - ArrowErrorSet( - error, - "Expected child of fixed-size list array with length >= %ld but found array " - "with length %ld", - (long)last_offset, (long)array->children[0]->length); - return EINVAL; - } - break; default: break; } + // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK( - ArrowArrayViewSetArray(array_view->children[i], array->children[i], error)); + ArrowArrayViewValidateDefault(array_view->children[i], error)); } + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + return NANOARROW_OK; } @@ -2571,10 +3046,8 @@ static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { - int32_t diff = view.data.as_int32[i] - view.data.as_int32[i - 1]; - if (diff < 0) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", - (long)i, (long)diff); + if (view.data.as_int32[i] < view.data.as_int32[i - 1]) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); return EINVAL; } } @@ -2589,10 +3062,8 @@ static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { - int64_t diff = view.data.as_int64[i] - view.data.as_int64[i - 1]; - if (diff < 0) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", - (long)i, (long)diff); + if (view.data.as_int64[i] < view.data.as_int64[i - 1]) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); return EINVAL; } } @@ -2635,9 +3106,9 @@ static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, return NANOARROW_OK; } -ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, - struct ArrowError* error) { - for (int i = 0; i < 3; i++) { +static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, + struct ArrowError* error) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_DATA_OFFSET: if (array_view->layout.element_size_bits[i] == 32) { @@ -2655,17 +3126,18 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { - // Check that we have valid type ids. if (array_view->union_type_id_map == NULL) { - // If the union_type_id map is NULL - // (e.g., when using ArrowArrayInitFromType() + ArrowArrayAllocateChildren() - // + ArrowArrayFinishBuilding()), we don't have enough information to validate - // this buffer (GH-178). + // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + + // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough + // information to validate this buffer. + ArrowErrorSet(error, + "Insufficient information provided for validation of union array"); + return EINVAL; } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices( array_view->union_type_id_map, array_view->n_children, array_view->n_children)) { - NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8(array_view->buffer_views[0], 0, - array_view->n_children - 1, error)); + NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( + array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error)); } else { NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0], array_view->union_type_id_map + 128, @@ -2676,27 +3148,53 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION && array_view->union_type_id_map != NULL) { // Check that offsets refer to child elements that actually exist - for (int64_t i = 0; i < array_view->array->length; i++) { + for (int64_t i = 0; i < array_view->length; i++) { int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i); int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); - int64_t child_length = array_view->array->children[child_id]->length; + int64_t child_length = array_view->children[child_id]->length; if (offset < 0 || offset > child_length) { ArrowErrorSet( error, "[%ld] Expected union offset for child id %d to be between 0 and %ld but " "found offset value %ld", - (long)i, (int)child_id, (long)child_length, offset); + (long)i, (int)child_id, (long)child_length, (long)offset); return EINVAL; } } } + // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); } + // Dictionary valiation not implemented + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, error)); + // TODO: validate the indices + } + return NANOARROW_OK; } + +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + switch (validation_level) { + case NANOARROW_VALIDATION_LEVEL_NONE: + return NANOARROW_OK; + case NANOARROW_VALIDATION_LEVEL_MINIMAL: + return ArrowArrayViewValidateMinimal(array_view, error); + case NANOARROW_VALIDATION_LEVEL_DEFAULT: + return ArrowArrayViewValidateDefault(array_view, error); + case NANOARROW_VALIDATION_LEVEL_FULL: + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + return ArrowArrayViewValidateFull(array_view, error); + } + + ArrowErrorSet(error, "validation_level not recognized"); + return EINVAL; +} // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -2756,6 +3254,7 @@ static int ArrowBasicArrayStreamGetNext(struct ArrowArrayStream* array_stream, static const char* ArrowBasicArrayStreamGetLastError( struct ArrowArrayStream* array_stream) { + NANOARROW_UNUSED(array_stream); return NULL; } @@ -2768,12 +3267,12 @@ static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) (struct BasicArrayStreamPrivate*)array_stream->private_data; if (private_data->schema.release != NULL) { - private_data->schema.release(&private_data->schema); + ArrowSchemaRelease(&private_data->schema); } for (int64_t i = 0; i < private_data->n_arrays; i++) { if (private_data->arrays[i].release != NULL) { - private_data->arrays[i].release(&private_data->arrays[i]); + ArrowArrayRelease(&private_data->arrays[i]); } } @@ -2787,8 +3286,9 @@ static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, struct ArrowSchema* schema, int64_t n_arrays) { - struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)ArrowMalloc( - sizeof(struct BasicArrayStreamPrivate)); + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)ArrowMalloc( + sizeof(struct BasicArrayStreamPrivate)); if (private_data == NULL) { return ENOMEM; } @@ -2827,7 +3327,7 @@ void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_ ArrowArrayMove(array, &private_data->arrays[i]); } -ArrowErrorCode ArrowBasicArrayStreamValidate(struct ArrowArrayStream* array_stream, +ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, struct ArrowError* error) { struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)array_stream->private_data; diff --git a/libtiledbsoma/src/utils/nanoarrow.h b/libtiledbsoma/src/utils/nanoarrow.h new file mode 100644 index 0000000000..db53e4bc94 --- /dev/null +++ b/libtiledbsoma/src/utils/nanoarrow.h @@ -0,0 +1,3930 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_BUILD_ID_H_INCLUDED +#define NANOARROW_BUILD_ID_H_INCLUDED + +#define NANOARROW_VERSION_MAJOR 0 +#define NANOARROW_VERSION_MINOR 4 +#define NANOARROW_VERSION_PATCH 0 +#define NANOARROW_VERSION "0.4.0-SNAPSHOT" + +#define NANOARROW_VERSION_INT \ + (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ + NANOARROW_VERSION_PATCH) + +// #define NANOARROW_NAMESPACE YourNamespaceHere + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_NANOARROW_TYPES_H_INCLUDED +#define NANOARROW_NANOARROW_TYPES_H_INCLUDED + +#include +#include + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Extra guard for versions of Arrow without the canonical guard +#ifndef ARROW_FLAG_DICTIONARY_ORDERED + +/// \defgroup nanoarrow-arrow-cdata Arrow C Data interface +/// +/// The Arrow C Data (https://arrow.apache.org/docs/format/CDataInterface.html) +/// and Arrow C Stream +/// (https://arrow.apache.org/docs/format/CStreamInterface.html) interfaces are +/// part of the Arrow Columnar Format specification +/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow +/// documentation for documentation of these structures. +/// +/// @{ + +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +#define ARROW_C_STREAM_INTERFACE + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code + // otherwise. + // + // If successful, the ArrowSchema must be released independently from the + // stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code + // otherwise. + // + // If successful, the ArrowArray must be released independently from the + // stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this + // stream (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_STREAM_INTERFACE +#endif // ARROW_FLAG_DICTIONARY_ORDERED + +/// @} + +// Utility macros +#define _NANOARROW_CONCAT(x, y) x##y +#define _NANOARROW_MAKE_NAME(x, y) _NANOARROW_CONCAT(x, y) + +#define _NANOARROW_RETURN_NOT_OK_IMPL(NAME, EXPR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) \ + return NAME; \ + } while (0) + +#define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) + +#define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ <= max_) ? NANOARROW_OK : EINVAL) + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet( \ + (ERROR_PTR_EXPR), \ + "%s failed with errno %d\n* %s:%d", \ + EXPR_STR, \ + NAME, \ + __FILE__, \ + __LINE__); \ + return NAME; \ + } \ + } while (0) +#else +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet( \ + (ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ + return NAME; \ + } \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +// For checking ArrowErrorSet() calls for valid printf format strings/arguments +// If using mingw's c99-compliant printf, we need a different format-checking +// attribute +#if defined(__USE_MINGW_ANSI_STDIO) && defined(__MINGW_PRINTF_FORMAT) +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE \ + __attribute__((format(__MINGW_PRINTF_FORMAT, 2, 3))) +#elif defined(__GNUC__) +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE __attribute__((format(printf, 2, 3))) +#else +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE +#endif + +// For checking calls to functions that return ArrowErrorCode +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define NANOARROW_CHECK_RETURN_ATTRIBUTE __attribute__((warn_unused_result)) +#elif defined(_MSC_VER) && (_MSC_VER >= 1700) +#define NANOARROW_CHECK_RETURN_ATTRIBUTE _Check_return_ +#else +#define NANOARROW_CHECK_RETURN_ATTRIBUTE +#endif + +#else +#define NANOARROW_CHECK_RETURN_ATTRIBUTE +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE +#endif + +#define NANOARROW_UNUSED(x) (void)(x) + +/// \brief Return code for success. +/// \ingroup nanoarrow-errors +#define NANOARROW_OK 0 + +/// \brief Represents an errno-compatible error code +/// \ingroup nanoarrow-errors +typedef int ArrowErrorCode; + +#if defined(NANOARROW_DEBUG) +#define ArrowErrorCode NANOARROW_CHECK_RETURN_ATTRIBUTE ArrowErrorCode +#endif + +/// \brief Flags supported by ArrowSchemaViewInit() +/// \ingroup nanoarrow-schema-view +#define NANOARROW_FLAG_ALL_SUPPORTED \ + (ARROW_FLAG_DICTIONARY_ORDERED | ARROW_FLAG_NULLABLE | \ + ARROW_FLAG_MAP_KEYS_SORTED) + +/// \brief Error type containing a UTF-8 encoded message. +/// \ingroup nanoarrow-errors +struct ArrowError { + /// \brief A character buffer with space for an error message. + char message[1024]; +}; + +/// \brief Ensure an ArrowError is null-terminated by zeroing the first +/// character. \ingroup nanoarrow-errors +/// +/// If error is NULL, this function does nothing. +static inline void ArrowErrorInit(struct ArrowError* error) { + if (error != NULL) { + error->message[0] = '\0'; + } +} + +/// \brief Get the contents of an error +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, returns "", or returns the contents of the error message +/// otherwise. +static inline const char* ArrowErrorMessage(struct ArrowError* error) { + if (error == NULL) { + return ""; + } else { + return error->message; + } +} + +/// \brief Set the contents of an error from an existing null-terminated string +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, this function does nothing. +static inline void ArrowErrorSetString( + struct ArrowError* error, const char* src) { + if (error == NULL) { + return; + } + + int64_t src_len = strlen(src); + if (src_len >= ((int64_t)sizeof(error->message))) { + memcpy(error->message, src, sizeof(error->message) - 1); + error->message[sizeof(error->message) - 1] = '\0'; + } else { + memcpy(error->message, src, src_len); + error->message[src_len] = '\0'; + } +} + +/// \brief Check the result of an expression and return it if not NANOARROW_OK +/// \ingroup nanoarrow-errors +#define NANOARROW_RETURN_NOT_OK(EXPR) \ + _NANOARROW_RETURN_NOT_OK_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) + +/// \brief Check the result of an expression and return it if not NANOARROW_OK, +/// adding an auto-generated message to an ArrowError. +/// \ingroup nanoarrow-errors +/// +/// This macro is used to ensure that functions that accept an ArrowError +/// as input always set its message when returning an error code (e.g., when +/// calling a nanoarrow function that does *not* accept ArrowError). +#define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ + _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), \ + EXPR, \ + ERROR_EXPR, \ + #EXPR) + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ + do { \ + fprintf( \ + stderr, \ + "%s failed with code %d\n* %s:%d\n", \ + EXPR_STR, \ + (int)(VALUE), \ + __FILE__, \ + (int)__LINE__); \ + abort(); \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) \ + NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ + } while (0) + +/// \brief Assert that an expression's value is NANOARROW_OK +/// \ingroup nanoarrow-errors +/// +/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is +/// true), print a message to stderr and abort. If nanoarrow was built in +/// release mode, this statement has no effect. You can customize fatal error +/// behaviour be defining the NANOARROW_PRINT_AND_DIE macro before including +/// nanoarrow.h This macro is provided as a convenience for users and is not +/// used internally. +#define NANOARROW_ASSERT_OK(EXPR) \ + _NANOARROW_ASSERT_OK_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) + +#define _NANOARROW_DCHECK_IMPL(EXPR, EXPR_STR) \ + do { \ + if (!(EXPR)) \ + NANOARROW_PRINT_AND_DIE(-1, EXPR_STR); \ + } while (0) + +#define NANOARROW_DCHECK(EXPR) _NANOARROW_DCHECK_IMPL(EXPR, #EXPR) +#else +#define NANOARROW_ASSERT_OK(EXPR) EXPR +#define NANOARROW_DCHECK(EXPR) +#endif + +static inline void ArrowSchemaMove( + struct ArrowSchema* src, struct ArrowSchema* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowSchema)); + src->release = NULL; +} + +static inline void ArrowSchemaRelease(struct ArrowSchema* schema) { + NANOARROW_DCHECK(schema != NULL); + schema->release(schema); + NANOARROW_DCHECK(schema->release == NULL); +} + +static inline void ArrowArrayMove( + struct ArrowArray* src, struct ArrowArray* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowArray)); + src->release = NULL; +} + +static inline void ArrowArrayRelease(struct ArrowArray* array) { + NANOARROW_DCHECK(array != NULL); + array->release(array); + NANOARROW_DCHECK(array->release == NULL); +} + +static inline void ArrowArrayStreamMove( + struct ArrowArrayStream* src, struct ArrowArrayStream* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowArrayStream)); + src->release = NULL; +} + +static inline const char* ArrowArrayStreamGetLastError( + struct ArrowArrayStream* array_stream) { + NANOARROW_DCHECK(array_stream != NULL); + + const char* value = array_stream->get_last_error(array_stream); + if (value == NULL) { + return ""; + } else { + return value; + } +} + +static inline ArrowErrorCode ArrowArrayStreamGetSchema( + struct ArrowArrayStream* array_stream, + struct ArrowSchema* out, + struct ArrowError* error) { + NANOARROW_DCHECK(array_stream != NULL); + + int result = array_stream->get_schema(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } + + return result; +} + +static inline ArrowErrorCode ArrowArrayStreamGetNext( + struct ArrowArrayStream* array_stream, + struct ArrowArray* out, + struct ArrowError* error) { + NANOARROW_DCHECK(array_stream != NULL); + + int result = array_stream->get_next(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } + + return result; +} + +static inline void ArrowArrayStreamRelease( + struct ArrowArrayStream* array_stream) { + NANOARROW_DCHECK(array_stream != NULL); + array_stream->release(array_stream); + NANOARROW_DCHECK(array_stream->release == NULL); +} + +static char _ArrowIsLittleEndian(void) { + uint32_t check = 1; + char first_byte; + memcpy(&first_byte, &check, sizeof(char)); + return first_byte; +} + +/// \brief Arrow type enumerator +/// \ingroup nanoarrow-utils +/// +/// These names are intended to map to the corresponding arrow::Type::type +/// enumerator; however, the numeric values are specifically not equal +/// (i.e., do not rely on numeric comparison). +enum ArrowType { + NANOARROW_TYPE_UNINITIALIZED = 0, + NANOARROW_TYPE_NA = 1, + NANOARROW_TYPE_BOOL, + NANOARROW_TYPE_UINT8, + NANOARROW_TYPE_INT8, + NANOARROW_TYPE_UINT16, + NANOARROW_TYPE_INT16, + NANOARROW_TYPE_UINT32, + NANOARROW_TYPE_INT32, + NANOARROW_TYPE_UINT64, + NANOARROW_TYPE_INT64, + NANOARROW_TYPE_HALF_FLOAT, + NANOARROW_TYPE_FLOAT, + NANOARROW_TYPE_DOUBLE, + NANOARROW_TYPE_STRING, + NANOARROW_TYPE_BINARY, + NANOARROW_TYPE_FIXED_SIZE_BINARY, + NANOARROW_TYPE_DATE32, + NANOARROW_TYPE_DATE64, + NANOARROW_TYPE_TIMESTAMP, + NANOARROW_TYPE_TIME32, + NANOARROW_TYPE_TIME64, + NANOARROW_TYPE_INTERVAL_MONTHS, + NANOARROW_TYPE_INTERVAL_DAY_TIME, + NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DECIMAL256, + NANOARROW_TYPE_LIST, + NANOARROW_TYPE_STRUCT, + NANOARROW_TYPE_SPARSE_UNION, + NANOARROW_TYPE_DENSE_UNION, + NANOARROW_TYPE_DICTIONARY, + NANOARROW_TYPE_MAP, + NANOARROW_TYPE_EXTENSION, + NANOARROW_TYPE_FIXED_SIZE_LIST, + NANOARROW_TYPE_DURATION, + NANOARROW_TYPE_LARGE_STRING, + NANOARROW_TYPE_LARGE_BINARY, + NANOARROW_TYPE_LARGE_LIST, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO +}; + +/// \brief Get a string value of an enum ArrowType value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for type +static inline const char* ArrowTypeString(enum ArrowType type); + +static inline const char* ArrowTypeString(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_NA: + return "na"; + case NANOARROW_TYPE_BOOL: + return "bool"; + case NANOARROW_TYPE_UINT8: + return "uint8"; + case NANOARROW_TYPE_INT8: + return "int8"; + case NANOARROW_TYPE_UINT16: + return "uint16"; + case NANOARROW_TYPE_INT16: + return "int16"; + case NANOARROW_TYPE_UINT32: + return "uint32"; + case NANOARROW_TYPE_INT32: + return "int32"; + case NANOARROW_TYPE_UINT64: + return "uint64"; + case NANOARROW_TYPE_INT64: + return "int64"; + case NANOARROW_TYPE_HALF_FLOAT: + return "half_float"; + case NANOARROW_TYPE_FLOAT: + return "float"; + case NANOARROW_TYPE_DOUBLE: + return "double"; + case NANOARROW_TYPE_STRING: + return "string"; + case NANOARROW_TYPE_BINARY: + return "binary"; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + return "fixed_size_binary"; + case NANOARROW_TYPE_DATE32: + return "date32"; + case NANOARROW_TYPE_DATE64: + return "date64"; + case NANOARROW_TYPE_TIMESTAMP: + return "timestamp"; + case NANOARROW_TYPE_TIME32: + return "time32"; + case NANOARROW_TYPE_TIME64: + return "time64"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "interval_months"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "interval_day_time"; + case NANOARROW_TYPE_DECIMAL128: + return "decimal128"; + case NANOARROW_TYPE_DECIMAL256: + return "decimal256"; + case NANOARROW_TYPE_LIST: + return "list"; + case NANOARROW_TYPE_STRUCT: + return "struct"; + case NANOARROW_TYPE_SPARSE_UNION: + return "sparse_union"; + case NANOARROW_TYPE_DENSE_UNION: + return "dense_union"; + case NANOARROW_TYPE_DICTIONARY: + return "dictionary"; + case NANOARROW_TYPE_MAP: + return "map"; + case NANOARROW_TYPE_EXTENSION: + return "extension"; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return "fixed_size_list"; + case NANOARROW_TYPE_DURATION: + return "duration"; + case NANOARROW_TYPE_LARGE_STRING: + return "large_string"; + case NANOARROW_TYPE_LARGE_BINARY: + return "large_binary"; + case NANOARROW_TYPE_LARGE_LIST: + return "large_list"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "interval_month_day_nano"; + default: + return NULL; + } +} + +/// \brief Arrow time unit enumerator +/// \ingroup nanoarrow-utils +/// +/// These names and values map to the corresponding arrow::TimeUnit::type +/// enumerator. +enum ArrowTimeUnit { + NANOARROW_TIME_UNIT_SECOND = 0, + NANOARROW_TIME_UNIT_MILLI = 1, + NANOARROW_TIME_UNIT_MICRO = 2, + NANOARROW_TIME_UNIT_NANO = 3 +}; + +/// \brief Validation level enumerator +/// \ingroup nanoarrow-array +enum ArrowValidationLevel { + /// \brief Do not validate buffer sizes or content. + NANOARROW_VALIDATION_LEVEL_NONE = 0, + + /// \brief Validate buffer sizes that depend on array length but do not + /// validate buffer + /// sizes that depend on buffer data access. + NANOARROW_VALIDATION_LEVEL_MINIMAL = 1, + + /// \brief Validate all buffer sizes, including those that require buffer + /// data access, + /// but do not perform any checks that are O(1) along the length of the + /// buffers. + NANOARROW_VALIDATION_LEVEL_DEFAULT = 2, + + /// \brief Validate all buffer sizes and all buffer content. This is useful + /// in the + /// context of untrusted input or input that may have been corrupted in + /// transit. + NANOARROW_VALIDATION_LEVEL_FULL = 3 +}; + +/// \brief Get a string value of an enum ArrowTimeUnit value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for time_unit +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); + +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case NANOARROW_TIME_UNIT_MILLI: + return "ms"; + case NANOARROW_TIME_UNIT_MICRO: + return "us"; + case NANOARROW_TIME_UNIT_NANO: + return "ns"; + default: + return NULL; + } +} + +/// \brief Functional types of buffers as described in the Arrow Columnar +/// Specification \ingroup nanoarrow-array-view +enum ArrowBufferType { + NANOARROW_BUFFER_TYPE_NONE, + NANOARROW_BUFFER_TYPE_VALIDITY, + NANOARROW_BUFFER_TYPE_TYPE_ID, + NANOARROW_BUFFER_TYPE_UNION_OFFSET, + NANOARROW_BUFFER_TYPE_DATA_OFFSET, + NANOARROW_BUFFER_TYPE_DATA +}; + +/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout +/// \ingroup nanoarrow-array-view +/// +/// All currently supported types have 3 buffers or fewer; however, future types +/// may involve a variable number of buffers (e.g., string view). These buffers +/// will be represented by separate members of the ArrowArrayView or +/// ArrowLayout. +#define NANOARROW_MAX_FIXED_BUFFERS 3 + +/// \brief An non-owning view of a string +/// \ingroup nanoarrow-utils +struct ArrowStringView { + /// \brief A pointer to the start of the string + /// + /// If size_bytes is 0, this value may be NULL. + const char* data; + + /// \brief The size of the string in bytes, + /// + /// (Not including the null terminator.) + int64_t size_bytes; +}; + +/// \brief Return a view of a const C string +/// \ingroup nanoarrow-utils +static inline struct ArrowStringView ArrowCharView(const char* value); + +static inline struct ArrowStringView ArrowCharView(const char* value) { + struct ArrowStringView out; + + out.data = value; + if (value) { + out.size_bytes = (int64_t)strlen(value); + } else { + out.size_bytes = 0; + } + + return out; +} + +union ArrowBufferViewData { + const void* data; + const int8_t* as_int8; + const uint8_t* as_uint8; + const int16_t* as_int16; + const uint16_t* as_uint16; + const int32_t* as_int32; + const uint32_t* as_uint32; + const int64_t* as_int64; + const uint64_t* as_uint64; + const double* as_double; + const float* as_float; + const char* as_char; +}; + +/// \brief An non-owning view of a buffer +/// \ingroup nanoarrow-utils +struct ArrowBufferView { + /// \brief A pointer to the start of the buffer + /// + /// If size_bytes is 0, this value may be NULL. + union ArrowBufferViewData data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; +}; + +/// \brief Array buffer allocation and deallocation +/// \ingroup nanoarrow-buffer +/// +/// Container for allocate, reallocate, and free methods that can be used +/// to customize allocation and deallocation of buffers when constructing +/// an ArrowArray. +struct ArrowBufferAllocator { + /// \brief Reallocate a buffer or return NULL if it cannot be reallocated + uint8_t* (*reallocate)( + struct ArrowBufferAllocator* allocator, + uint8_t* ptr, + int64_t old_size, + int64_t new_size); + + /// \brief Deallocate a buffer allocated by this allocator + void (*free)( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); + + /// \brief Opaque data specific to the allocator + void* private_data; +}; + +/// \brief An owning mutable view of a buffer +/// \ingroup nanoarrow-buffer +struct ArrowBuffer { + /// \brief A pointer to the start of the buffer + /// + /// If capacity_bytes is 0, this value may be NULL. + uint8_t* data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; + + /// \brief The capacity of the buffer in bytes + int64_t capacity_bytes; + + /// \brief The allocator that will be used to reallocate and/or free the + /// buffer + struct ArrowBufferAllocator allocator; +}; + +/// \brief An owning mutable view of a bitmap +/// \ingroup nanoarrow-bitmap +struct ArrowBitmap { + /// \brief An ArrowBuffer to hold the allocated memory + struct ArrowBuffer buffer; + + /// \brief The number of bits that have been appended to the bitmap + int64_t size_bits; +}; + +/// \brief A description of an arrangement of buffers +/// \ingroup nanoarrow-utils +/// +/// Contains the minimum amount of information required to +/// calculate the size of each buffer in an ArrowArray knowing only +/// the length and offset of the array. +struct ArrowLayout { + /// \brief The function of each buffer + enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The data type of each buffer + enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The size of an element each buffer or 0 if this size is variable + /// or unknown + int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The number of elements in the child array per element in this + /// array for a fixed-size list + int64_t child_size_elements; +}; + +/// \brief A non-owning view of an ArrowArray +/// \ingroup nanoarrow-array-view +/// +/// This data structure provides access to the values contained within +/// an ArrowArray with fields provided in a more readily-extractible +/// form. You can re-use an ArrowArrayView for multiple ArrowArrays +/// with the same storage type, use it to represent a hypothetical +/// ArrowArray that does not exist yet, or use it to validate the buffers +/// of a future ArrowArray. +struct ArrowArrayView { + /// \brief The underlying ArrowArray or NULL if it has not been set or + /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. + const struct ArrowArray* array; + + /// \brief The number of elements from the physical start of the buffers. + int64_t offset; + + /// \brief The number of elements in this view. + int64_t length; + + /// \brief A cached null count or -1 to indicate that this value is unknown. + int64_t null_count; + + /// \brief The type used to store values in this array + /// + /// This type represents only the minimum required information to + /// extract values from the array buffers (e.g., for a Date32 array, + /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded + /// arrays, this will be the index type. + enum ArrowType storage_type; + + /// \brief The buffer types, strides, and sizes of this Array's buffers + struct ArrowLayout layout; + + /// \brief This Array's buffers as ArrowBufferView objects + struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The number of children of this view + int64_t n_children; + + /// \brief Pointers to views of this array's children + struct ArrowArrayView** children; + + /// \brief Pointer to a view of this array's dictionary + struct ArrowArrayView* dictionary; + + /// \brief Union type id to child index mapping + /// + /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer + /// such that child_index == union_type_id_map[type_id] and + /// type_id == union_type_id_map[128 + child_index]. This value may be + /// NULL in the case where child_id == type_id. + int8_t* union_type_id_map; +}; + +// Used as the private data member for ArrowArrays allocated here and accessed +// internally within inline ArrowArray* helpers. +struct ArrowArrayPrivateData { + // Holder for the validity buffer (or first buffer for union types, which + // are the only type whose first buffer is not a valdiity buffer) + struct ArrowBitmap bitmap; + + // Holder for additional buffers as required + struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1]; + + // The array of pointers to buffers. This must be updated after a sequence + // of appends to synchronize its values with the actual buffer addresses + // (which may have ben reallocated uring that time) + const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; + + // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown + enum ArrowType storage_type; + + // The buffer arrangement for the storage type + struct ArrowLayout layout; + + // Flag to indicate if there are non-sequence union type ids. + // In the future this could be replaced with a type id<->child mapping + // to support constructing unions in append mode where type_id != + // child_index + int8_t union_type_id_is_child_index; +}; + +/// \brief A representation of an interval. +/// \ingroup nanoarrow-utils +struct ArrowInterval { + /// \brief The type of interval being used + enum ArrowType type; + /// \brief The number of months represented by the interval + int32_t months; + /// \brief The number of days represented by the interval + int32_t days; + /// \brief The number of ms represented by the interval + int32_t ms; + /// \brief The number of ns represented by the interval + int64_t ns; +}; + +/// \brief Zero initialize an Interval with a given unit +/// \ingroup nanoarrow-utils +static inline void ArrowIntervalInit( + struct ArrowInterval* interval, enum ArrowType type) { + memset(interval, 0, sizeof(struct ArrowInterval)); + interval->type = type; +} + +/// \brief A representation of a fixed-precision decimal number +/// \ingroup nanoarrow-utils +/// +/// This structure should be initialized with ArrowDecimalInit() once and +/// values set using ArrowDecimalSetInt(), ArrowDecimalSetBytes128(), +/// or ArrowDecimalSetBytes256(). +struct ArrowDecimal { + /// \brief An array of 64-bit integers of n_words length defined in + /// native-endian order + uint64_t words[4]; + + /// \brief The number of significant digits this decimal number can + /// represent + int32_t precision; + + /// \brief The number of digits after the decimal point. This can be + /// negative. + int32_t scale; + + /// \brief The number of words in the words array + int n_words; + + /// \brief Cached value used by the implementation + int high_word_index; + + /// \brief Cached value used by the implementation + int low_word_index; +}; + +/// \brief Initialize a decimal with a given set of type parameters +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalInit( + struct ArrowDecimal* decimal, + int32_t bitwidth, + int32_t precision, + int32_t scale) { + memset(decimal->words, 0, sizeof(decimal->words)); + decimal->precision = precision; + decimal->scale = scale; + decimal->n_words = bitwidth / 8 / sizeof(uint64_t); + + if (_ArrowIsLittleEndian()) { + decimal->low_word_index = 0; + decimal->high_word_index = decimal->n_words - 1; + } else { + decimal->low_word_index = decimal->n_words - 1; + decimal->high_word_index = 0; + } +} + +/// \brief Get a signed integer value of a sufficiently small ArrowDecimal +/// +/// This does not check if the decimal's precision sufficiently small to fit +/// within the signed 64-bit integer range (A precision less than or equal +/// to 18 is sufficiently small). +static inline int64_t ArrowDecimalGetIntUnsafe( + const struct ArrowDecimal* decimal) { + return (int64_t)decimal->words[decimal->low_word_index]; +} + +/// \brief Copy the bytes of this decimal into a sufficiently large buffer +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalGetBytes( + const struct ArrowDecimal* decimal, uint8_t* out) { + memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); +} + +/// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise +/// \ingroup nanoarrow-utils +static inline int64_t ArrowDecimalSign(const struct ArrowDecimal* decimal) { + return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); +} + +/// \brief Sets the integer value of this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetInt( + struct ArrowDecimal* decimal, int64_t value) { + if (value < 0) { + memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); + } else { + memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); + } + + decimal->words[decimal->low_word_index] = value; +} + +/// \brief Negate the value of this decimal in place +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalNegate(struct ArrowDecimal* decimal) { + uint64_t carry = 1; + + if (decimal->low_word_index == 0) { + for (int i = 0; i < decimal->n_words; i++) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } + } else { + for (int i = decimal->low_word_index; i >= 0; i--) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } + } +} + +/// \brief Copy bytes from a buffer into this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetBytes( + struct ArrowDecimal* decimal, const uint8_t* value) { + memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_H_INCLUDED +#define NANOARROW_H_INCLUDED + +#include +#include +#include + +// If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will +// set this define in nanoarrow_config.h. If not, you can optionally #define +// NANOARROW_NAMESPACE MyNamespace here. + +// This section remaps the non-prefixed symbols to the prefixed symbols so that +// code written against this build can be used independent of the value of +// NANOARROW_NAMESPACE. +#ifdef NANOARROW_NAMESPACE +#define NANOARROW_CAT(A, B) A##B +#define NANOARROW_SYMBOL(A, B) NANOARROW_CAT(A, B) + +#define ArrowNanoarrowVersion \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) +#define ArrowNanoarrowVersionInt \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) +#define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc) +#define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc) +#define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree) +#define ArrowBufferAllocatorDefault \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault) +#define ArrowBufferDeallocator \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) +#define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet) +#define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit) +#define ArrowDecimalSetDigits \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalSetDigits) +#define ArrowDecimalAppendDigitsToBuffer \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalAppendDigitsToBuffer) +#define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit) +#define ArrowSchemaInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) +#define ArrowSchemaSetType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType) +#define ArrowSchemaSetTypeStruct \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct) +#define ArrowSchemaSetTypeFixedSize \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) +#define ArrowSchemaSetTypeDecimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) +#define ArrowSchemaSetTypeDateTime \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) +#define ArrowSchemaSetTypeUnion \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion) +#define ArrowSchemaDeepCopy \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy) +#define ArrowSchemaSetFormat \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat) +#define ArrowSchemaSetName \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName) +#define ArrowSchemaSetMetadata \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata) +#define ArrowSchemaAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren) +#define ArrowSchemaAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary) +#define ArrowMetadataReaderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit) +#define ArrowMetadataReaderRead \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead) +#define ArrowMetadataSizeOf \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf) +#define ArrowMetadataHasKey \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey) +#define ArrowMetadataGetValue \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue) +#define ArrowMetadataBuilderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit) +#define ArrowMetadataBuilderAppend \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend) +#define ArrowMetadataBuilderSet \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet) +#define ArrowMetadataBuilderRemove \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove) +#define ArrowSchemaViewInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit) +#define ArrowSchemaToString \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString) +#define ArrowArrayInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) +#define ArrowArrayInitFromSchema \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) +#define ArrowArrayAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary) +#define ArrowArraySetValidityBitmap \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap) +#define ArrowArraySetBuffer \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer) +#define ArrowArrayReserve \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve) +#define ArrowArrayFinishBuilding \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) +#define ArrowArrayFinishBuildingDefault \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) +#define ArrowArrayViewInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) +#define ArrowArrayViewInitFromSchema \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) +#define ArrowArrayViewAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) +#define ArrowArrayViewAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary) +#define ArrowArrayViewSetLength \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength) +#define ArrowArrayViewSetArray \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) +#define ArrowArrayViewSetArrayMinimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) +#define ArrowArrayViewValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) +#define ArrowArrayViewReset \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) +#define ArrowBasicArrayStreamInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) +#define ArrowBasicArrayStreamSetArray \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray) +#define ArrowBasicArrayStreamValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate) + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/// \defgroup nanoarrow Nanoarrow C library +/// +/// Except where noted, objects are not thread-safe and clients should +/// take care to serialize accesses to methods. +/// +/// Because this library is intended to be vendored, it provides full type +/// definitions and encourages clients to stack or statically allocate +/// where convenient. + +/// \defgroup nanoarrow-malloc Memory management +/// +/// Non-buffer members of a struct ArrowSchema and struct ArrowArray +/// must be allocated using ArrowMalloc() or ArrowRealloc() and freed +/// using ArrowFree() for schemas and arrays allocated here. Buffer members +/// are allocated using an ArrowBufferAllocator. +/// +/// @{ + +/// \brief Allocate like malloc() +void* ArrowMalloc(int64_t size); + +/// \brief Reallocate like realloc() +void* ArrowRealloc(void* ptr, int64_t size); + +/// \brief Free a pointer allocated using ArrowMalloc() or ArrowRealloc(). +void ArrowFree(void* ptr); + +/// \brief Return the default allocator +/// +/// The default allocator uses ArrowMalloc(), ArrowRealloc(), and +/// ArrowFree(). +struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void); + +/// \brief Create a custom deallocator +/// +/// Creates a buffer allocator with only a free method that can be used to +/// attach a custom deallocator to an ArrowBuffer. This may be used to +/// avoid copying an existing buffer that was not allocated using the +/// infrastructure provided here (e.g., by an R or Python object). +struct ArrowBufferAllocator ArrowBufferDeallocator( + void (*custom_free)( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size), + void* private_data); + +/// @} + +/// \brief Move the contents of an src ArrowSchema into dst and set src->release +/// to NULL \ingroup nanoarrow-arrow-cdata +static inline void ArrowSchemaMove( + struct ArrowSchema* src, struct ArrowSchema* dst); + +/// \brief Call the release callback of an ArrowSchema +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowSchemaRelease(struct ArrowSchema* schema); + +/// \brief Move the contents of an src ArrowArray into dst and set src->release +/// to NULL \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayMove( + struct ArrowArray* src, struct ArrowArray* dst); + +/// \brief Call the release callback of an ArrowArray +static inline void ArrowArrayRelease(struct ArrowArray* array); + +/// \brief Move the contents of an src ArrowArrayStream into dst and set +/// src->release to NULL \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayStreamMove( + struct ArrowArrayStream* src, struct ArrowArrayStream* dst); + +/// \brief Call the get_schema callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_schema callback, this wrapper checks the return code +/// and propagates the error reported by get_last_error into error. This +/// makes it significantly less verbose to iterate over array streams +/// using NANOARROW_RETURN_NOT_OK()-style error handling. +static inline ArrowErrorCode ArrowArrayStreamGetSchema( + struct ArrowArrayStream* array_stream, + struct ArrowSchema* out, + struct ArrowError* error); + +/// \brief Call the get_schema callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_next callback, this wrapper checks the return code +/// and propagates the error reported by get_last_error into error. This +/// makes it significantly less verbose to iterate over array streams +/// using NANOARROW_RETURN_NOT_OK()-style error handling. +static inline ArrowErrorCode ArrowArrayStreamGetNext( + struct ArrowArrayStream* array_stream, + struct ArrowArray* out, + struct ArrowError* error); + +/// \brief Call the get_next callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_next callback, this function never returns NULL (i.e., its +/// result is safe to use in printf-style error formatters). Null values from +/// the original callback are reported as "". +static inline const char* ArrowArrayStreamGetLastError( + struct ArrowArrayStream* array_stream); + +/// \brief Call the release callback of an ArrowArrayStream +static inline void ArrowArrayStreamRelease( + struct ArrowArrayStream* array_stream); + +/// \defgroup nanoarrow-errors Error handling +/// +/// Functions generally return an errno-compatible error code; functions that +/// need to communicate more verbose error information accept a pointer +/// to an ArrowError. This can be stack or statically allocated. The +/// content of the message is undefined unless an error code has been +/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, +/// the ArrowError pointed to by the argument will be propagated with a +/// null-terminated error message. It is safe to pass a NULL ArrowError anywhere +/// in the nanoarrow API. +/// +/// Except where documented, it is generally not safe to continue after a +/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK +/// and NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ +/// clients can use the helpers provided in the nanoarrow.hpp header to +/// facilitate using C++ idioms for memory management and error propgagtion. +/// +/// @{ + +/// \brief Set the contents of an error using printf syntax. +/// +/// If error is NULL, this function does nothing and returns NANOARROW_OK. +NANOARROW_CHECK_PRINTF_ATTRIBUTE int ArrowErrorSet( + struct ArrowError* error, const char* fmt, ...); + +/// @} + +/// \defgroup nanoarrow-utils Utility data structures +/// +/// @{ + +/// \brief Return a version string in the form "major.minor.patch" +const char* ArrowNanoarrowVersion(void); + +/// \brief Return an integer that can be used to compare versions sequentially +int ArrowNanoarrowVersionInt(void); + +/// \brief Initialize a description of buffer arrangements from a storage type +void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type); + +/// \brief Create a string view from a null-terminated string +static inline struct ArrowStringView ArrowCharView(const char* value); + +/// \brief Sets the integer value of an ArrowDecimal from a string +ArrowErrorCode ArrowDecimalSetDigits( + struct ArrowDecimal* decimal, struct ArrowStringView value); + +/// \brief Get the integer value of an ArrowDecimal as string +ArrowErrorCode ArrowDecimalAppendDigitsToBuffer( + const struct ArrowDecimal* decimal, struct ArrowBuffer* buffer); + +/// @} + +/// \defgroup nanoarrow-schema Creating schemas +/// +/// These functions allocate, copy, and destroy ArrowSchema structures +/// +/// @{ + +/// \brief Initialize an ArrowSchema +/// +/// Initializes the fields and release callback of schema_out. Caller +/// is responsible for calling the schema->release callback if +/// NANOARROW_OK is returned. +void ArrowSchemaInit(struct ArrowSchema* schema); + +/// \brief Initialize an ArrowSchema from an ArrowType +/// +/// A convenience constructor for that calls ArrowSchemaInit() and +/// ArrowSchemaSetType() for the common case of constructing an +/// unparameterized type. The caller is responsible for calling the +/// schema->release callback if NANOARROW_OK is returned. +ArrowErrorCode ArrowSchemaInitFromType( + struct ArrowSchema* schema, enum ArrowType type); + +/// \brief Get a human-readable summary of a Schema +/// +/// Writes a summary of an ArrowSchema to out (up to n - 1 characters) +/// and returns the number of characters required for the output if +/// n were sufficiently large. If recursive is non-zero, the result will +/// also include children. +int64_t ArrowSchemaToString( + const struct ArrowSchema* schema, char* out, int64_t n, char recursive); + +/// \brief Set the format field of a schema from an ArrowType +/// +/// Initializes the fields and release callback of schema_out. For +/// NANOARROW_TYPE_LIST, NANOARROW_TYPE_LARGE_LIST, and +/// NANOARROW_TYPE_MAP, the appropriate number of children are +/// allocated, initialized, and named; however, the caller must +/// ArrowSchemaSetType() on the preinitialized children. Schema must have been +/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetType( + struct ArrowSchema* schema, enum ArrowType type); + +/// \brief Set the format field and initialize children of a struct schema +/// +/// The specified number of children are initialized; however, the caller is +/// responsible for calling ArrowSchemaSetType() and ArrowSchemaSetName() on +/// each child. Schema must have been initialized using ArrowSchemaInit() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeStruct( + struct ArrowSchema* schema, int64_t n_children); + +/// \brief Set the format field of a fixed-size schema +/// +/// Returns EINVAL for fixed_size <= 0 or for type that is not +/// NANOARROW_TYPE_FIXED_SIZE_BINARY or NANOARROW_TYPE_FIXED_SIZE_LIST. +/// For NANOARROW_TYPE_FIXED_SIZE_LIST, the appropriate number of children are +/// allocated, initialized, and named; however, the caller must +/// ArrowSchemaSetType() the first child. Schema must have been initialized +/// using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeFixedSize( + struct ArrowSchema* schema, enum ArrowType type, int32_t fixed_size); + +/// \brief Set the format field of a decimal schema +/// +/// Returns EINVAL for scale <= 0 or for type that is not +/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have +/// been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDecimal( + struct ArrowSchema* schema, + enum ArrowType type, + int32_t decimal_precision, + int32_t decimal_scale); + +/// \brief Set the format field of a time, timestamp, or duration schema +/// +/// Returns EINVAL for type that is not +/// NANOARROW_TYPE_TIME32, NANOARROW_TYPE_TIME64, +/// NANOARROW_TYPE_TIMESTAMP, or NANOARROW_TYPE_DURATION. The +/// timezone parameter must be NULL for a non-timestamp type. Schema must have +/// been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDateTime( + struct ArrowSchema* schema, + enum ArrowType type, + enum ArrowTimeUnit time_unit, + const char* timezone); + +/// \brief Seet the format field of a union schema +/// +/// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION +/// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are +/// allocated, and initialized. +ArrowErrorCode ArrowSchemaSetTypeUnion( + struct ArrowSchema* schema, enum ArrowType type, int64_t n_children); + +/// \brief Make a (recursive) copy of a schema +/// +/// Allocates and copies fields of schema into schema_out. +ArrowErrorCode ArrowSchemaDeepCopy( + const struct ArrowSchema* schema, struct ArrowSchema* schema_out); + +/// \brief Copy format into schema->format +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetFormat( + struct ArrowSchema* schema, const char* format); + +/// \brief Copy name into schema->name +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name); + +/// \brief Copy metadata into schema->metadata +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy. +ArrowErrorCode ArrowSchemaSetMetadata( + struct ArrowSchema* schema, const char* metadata); + +/// \brief Allocate the schema->children array +/// +/// Includes the memory for each child struct ArrowSchema. +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaAllocateChildren( + struct ArrowSchema* schema, int64_t n_children); + +/// \brief Allocate the schema->dictionary member +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema); + +/// @} + +/// \defgroup nanoarrow-metadata Create, read, and modify schema metadata +/// +/// @{ + +/// \brief Reader for key/value pairs in schema metadata +/// +/// The ArrowMetadataReader does not own any data and is only valid +/// for the lifetime of the underlying metadata pointer. +struct ArrowMetadataReader { + /// \brief A metadata string from a schema->metadata field. + const char* metadata; + + /// \brief The current offset into the metadata string + int64_t offset; + + /// \brief The number of remaining keys + int32_t remaining_keys; +}; + +/// \brief Initialize an ArrowMetadataReader +ArrowErrorCode ArrowMetadataReaderInit( + struct ArrowMetadataReader* reader, const char* metadata); + +/// \brief Read the next key/value pair from an ArrowMetadataReader +ArrowErrorCode ArrowMetadataReaderRead( + struct ArrowMetadataReader* reader, + struct ArrowStringView* key_out, + struct ArrowStringView* value_out); + +/// \brief The number of bytes in in a key/value metadata string +int64_t ArrowMetadataSizeOf(const char* metadata); + +/// \brief Check for a key in schema metadata +char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key); + +/// \brief Extract a value from schema metadata +/// +/// If key does not exist in metadata, value_out is unmodified +ArrowErrorCode ArrowMetadataGetValue( + const char* metadata, + struct ArrowStringView key, + struct ArrowStringView* value_out); + +/// \brief Initialize a builder for schema metadata from key/value pairs +/// +/// metadata can be an existing metadata string or NULL to initialize +/// an empty metadata string. +ArrowErrorCode ArrowMetadataBuilderInit( + struct ArrowBuffer* buffer, const char* metadata); + +/// \brief Append a key/value pair to a buffer containing serialized metadata +ArrowErrorCode ArrowMetadataBuilderAppend( + struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); + +/// \brief Set a key/value pair to a buffer containing serialized metadata +/// +/// Ensures that the only entry for key in the metadata is set to value. +/// This function maintains the existing position of (the first instance of) +/// key if present in the data. +ArrowErrorCode ArrowMetadataBuilderSet( + struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); + +/// \brief Remove a key from a buffer containing serialized metadata +ArrowErrorCode ArrowMetadataBuilderRemove( + struct ArrowBuffer* buffer, struct ArrowStringView key); + +/// @} + +/// \defgroup nanoarrow-schema-view Reading schemas +/// +/// @{ + +/// \brief A non-owning view of a parsed ArrowSchema +/// +/// Contains more readily extractable values than a raw ArrowSchema. +/// Clients can stack or statically allocate this structure but are +/// encouraged to use the provided getters to ensure forward +/// compatibility. +struct ArrowSchemaView { + /// \brief A pointer to the schema represented by this view + const struct ArrowSchema* schema; + + /// \brief The data type represented by the schema + /// + /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a + /// non-null dictionary member; datetime types are valid values. + /// This value will never be NANOARROW_TYPE_EXTENSION (see + /// extension_name and/or extension_metadata to check for + /// an extension type). + enum ArrowType type; + + /// \brief The storage data type represented by the schema + /// + /// This value will never be NANOARROW_TYPE_DICTIONARY, + /// NANOARROW_TYPE_EXTENSION or any datetime type. This value represents + /// only the type required to interpret the buffers in the array. + enum ArrowType storage_type; + + /// \brief The storage layout represented by the schema + struct ArrowLayout layout; + + /// \brief The extension type name if it exists + /// + /// If the ARROW:extension:name key is present in schema.metadata, + /// extension_name.data will be non-NULL. + struct ArrowStringView extension_name; + + /// \brief The extension type metadata if it exists + /// + /// If the ARROW:extension:metadata key is present in schema.metadata, + /// extension_metadata.data will be non-NULL. + struct ArrowStringView extension_metadata; + + /// \brief Format fixed size parameter + /// + /// This value is set when parsing a fixed-size binary or fixed-size + /// list schema; this value is undefined for other types. For a + /// fixed-size binary schema this value is in bytes; for a fixed-size + /// list schema this value refers to the number of child elements for + /// each element of the parent. + int32_t fixed_size; + + /// \brief Decimal bitwidth + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_bitwidth; + + /// \brief Decimal precision + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_precision; + + /// \brief Decimal scale + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_scale; + + /// \brief Format time unit parameter + /// + /// This value is set when parsing a date/time type. The value is + /// undefined for other types. + enum ArrowTimeUnit time_unit; + + /// \brief Format timezone parameter + /// + /// This value is set when parsing a timestamp type and represents + /// the timezone format parameter. This value points to + /// data within the schema and is undefined for other types. + const char* timezone; + + /// \brief Union type ids parameter + /// + /// This value is set when parsing a union type and represents + /// type ids parameter. This value points to + /// data within the schema and is undefined for other types. + const char* union_type_ids; +}; + +/// \brief Initialize an ArrowSchemaView +ArrowErrorCode ArrowSchemaViewInit( + struct ArrowSchemaView* schema_view, + const struct ArrowSchema* schema, + struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-buffer Owning, growable buffers +/// +/// @{ + +/// \brief Initialize an ArrowBuffer +/// +/// Initialize a buffer with a NULL, zero-size buffer using the default +/// buffer allocator. +static inline void ArrowBufferInit(struct ArrowBuffer* buffer); + +/// \brief Set a newly-initialized buffer's allocator +/// +/// Returns EINVAL if the buffer has already been allocated. +static inline ArrowErrorCode ArrowBufferSetAllocator( + struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator); + +/// \brief Reset an ArrowBuffer +/// +/// Releases the buffer using the allocator's free method if +/// the buffer's data member is non-null, sets the data member +/// to NULL, and sets the buffer's size and capacity to 0. +static inline void ArrowBufferReset(struct ArrowBuffer* buffer); + +/// \brief Move an ArrowBuffer +/// +/// Transfers the buffer data and lifecycle management to another +/// address and resets buffer. +static inline void ArrowBufferMove( + struct ArrowBuffer* src, struct ArrowBuffer* dst); + +/// \brief Grow or shrink a buffer to a given capacity +/// +/// When shrinking the capacity of the buffer, the buffer is only reallocated +/// if shrink_to_fit is non-zero. Calling ArrowBufferResize() does not +/// adjust the buffer's size member except to ensure that the invariant +/// capacity >= size remains true. +static inline ArrowErrorCode ArrowBufferResize( + struct ArrowBuffer* buffer, int64_t new_capacity_bytes, char shrink_to_fit); + +/// \brief Ensure a buffer has at least a given additional capacity +/// +/// Ensures that the buffer has space to append at least +/// additional_size_bytes, overallocating when required. +static inline ArrowErrorCode ArrowBufferReserve( + struct ArrowBuffer* buffer, int64_t additional_size_bytes); + +/// \brief Write data to buffer and increment the buffer size +/// +/// This function does not check that buffer has the required capacity +static inline void ArrowBufferAppendUnsafe( + struct ArrowBuffer* buffer, const void* data, int64_t size_bytes); + +/// \brief Write data to buffer and increment the buffer size +/// +/// This function writes and ensures that the buffer has the required capacity, +/// possibly by reallocating the buffer. Like ArrowBufferReserve, this will +/// overallocate when reallocation is required. +static inline ArrowErrorCode ArrowBufferAppend( + struct ArrowBuffer* buffer, const void* data, int64_t size_bytes); + +/// \brief Write fill to buffer and increment the buffer size +/// +/// This function writes the specified number of fill bytes and +/// ensures that the buffer has the required capacity, +static inline ArrowErrorCode ArrowBufferAppendFill( + struct ArrowBuffer* buffer, uint8_t value, int64_t size_bytes); + +/// \brief Write an 8-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt8( + struct ArrowBuffer* buffer, int8_t value); + +/// \brief Write an unsigned 8-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt8( + struct ArrowBuffer* buffer, uint8_t value); + +/// \brief Write a 16-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt16( + struct ArrowBuffer* buffer, int16_t value); + +/// \brief Write an unsigned 16-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt16( + struct ArrowBuffer* buffer, uint16_t value); + +/// \brief Write a 32-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt32( + struct ArrowBuffer* buffer, int32_t value); + +/// \brief Write an unsigned 32-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt32( + struct ArrowBuffer* buffer, uint32_t value); + +/// \brief Write a 64-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt64( + struct ArrowBuffer* buffer, int64_t value); + +/// \brief Write an unsigned 64-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt64( + struct ArrowBuffer* buffer, uint64_t value); + +/// \brief Write a double to a buffer +static inline ArrowErrorCode ArrowBufferAppendDouble( + struct ArrowBuffer* buffer, double value); + +/// \brief Write a float to a buffer +static inline ArrowErrorCode ArrowBufferAppendFloat( + struct ArrowBuffer* buffer, float value); + +/// \brief Write an ArrowStringView to a buffer +static inline ArrowErrorCode ArrowBufferAppendStringView( + struct ArrowBuffer* buffer, struct ArrowStringView value); + +/// \brief Write an ArrowBufferView to a buffer +static inline ArrowErrorCode ArrowBufferAppendBufferView( + struct ArrowBuffer* buffer, struct ArrowBufferView value); + +/// @} + +/// \defgroup nanoarrow-bitmap Bitmap utilities +/// +/// @{ + +/// \brief Extract a boolean value from a bitmap +static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap to true +static inline void ArrowBitSet(uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap to false +static inline void ArrowBitClear(uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t value); + +/// \brief Set a boolean value to a range in a bitmap +static inline void ArrowBitsSetTo( + uint8_t* bits, int64_t start_offset, int64_t length, uint8_t bits_are_set); + +/// \brief Count true values in a bitmap +static inline int64_t ArrowBitCountSet( + const uint8_t* bits, int64_t i_from, int64_t i_to); + +/// \brief Extract int8 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt8( + const uint8_t* bits, int64_t start_offset, int64_t length, int8_t* out); + +/// \brief Extract int32 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt32( + const uint8_t* bits, int64_t start_offset, int64_t length, int32_t* out); + +/// \brief Initialize an ArrowBitmap +/// +/// Initialize the builder's buffer, empty its cache, and reset the size to zero +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap); + +/// \brief Move an ArrowBitmap +/// +/// Transfers the underlying buffer data and lifecycle management to another +/// address and resets the bitmap. +static inline void ArrowBitmapMove( + struct ArrowBitmap* src, struct ArrowBitmap* dst); + +/// \brief Ensure a bitmap builder has at least a given additional capacity +/// +/// Ensures that the buffer has space to append at least +/// additional_size_bits, overallocating when required. +static inline ArrowErrorCode ArrowBitmapReserve( + struct ArrowBitmap* bitmap, int64_t additional_size_bits); + +/// \brief Grow or shrink a bitmap to a given capacity +/// +/// When shrinking the capacity of the bitmap, the bitmap is only reallocated +/// if shrink_to_fit is non-zero. Calling ArrowBitmapResize() does not +/// adjust the buffer's size member except when shrinking new_capacity_bits +/// to a value less than the current number of bits in the bitmap. +static inline ArrowErrorCode ArrowBitmapResize( + struct ArrowBitmap* bitmap, int64_t new_capacity_bits, char shrink_to_fit); + +/// \brief Reserve space for and append zero or more of the same boolean value +/// to a bitmap +static inline ArrowErrorCode ArrowBitmapAppend( + struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length); + +/// \brief Append zero or more of the same boolean value to a bitmap +static inline void ArrowBitmapAppendUnsafe( + struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length); + +/// \brief Append boolean values encoded as int8_t to a bitmap +/// +/// The values must all be 0 or 1. +static inline void ArrowBitmapAppendInt8Unsafe( + struct ArrowBitmap* bitmap, const int8_t* values, int64_t n_values); + +/// \brief Append boolean values encoded as int32_t to a bitmap +/// +/// The values must all be 0 or 1. +static inline void ArrowBitmapAppendInt32Unsafe( + struct ArrowBitmap* bitmap, const int32_t* values, int64_t n_values); + +/// \brief Reset a bitmap builder +/// +/// Releases any memory held by buffer, empties the cache, and resets the size +/// to zero +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap); + +/// @} + +/// \defgroup nanoarrow-array Creating arrays +/// +/// These functions allocate, copy, and destroy ArrowArray structures. +/// Once an ArrowArray has been initialized via ArrowArrayInitFromType() +/// or ArrowArrayInitFromSchema(), the caller is responsible for releasing +/// it using the embedded release callback. +/// +/// @{ + +/// \brief Initialize the fields of an array +/// +/// Initializes the fields and release callback of array. Caller +/// is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromType( + struct ArrowArray* array, enum ArrowType storage_type); + +/// \brief Initialize the contents of an ArrowArray from an ArrowSchema +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromSchema( + struct ArrowArray* array, + const struct ArrowSchema* schema, + struct ArrowError* error); + +/// \brief Initialize the contents of an ArrowArray from an ArrowArrayView +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromArrayView( + struct ArrowArray* array, + const struct ArrowArrayView* array_view, + struct ArrowError* error); + +/// \brief Allocate the array->children array +/// +/// Includes the memory for each child struct ArrowArray, +/// whose members are marked as released and may be subsequently initialized +/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. +/// schema must have been allocated using ArrowArrayInitFromType(). +ArrowErrorCode ArrowArrayAllocateChildren( + struct ArrowArray* array, int64_t n_children); + +/// \brief Allocate the array->dictionary member +/// +/// Includes the memory for the struct ArrowArray, whose contents +/// is marked as released and may be subsequently initialized +/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array); + +/// \brief Set the validity bitmap of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +void ArrowArraySetValidityBitmap( + struct ArrowArray* array, struct ArrowBitmap* bitmap); + +/// \brief Set a buffer of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArraySetBuffer( + struct ArrowArray* array, int64_t i, struct ArrowBuffer* buffer); + +/// \brief Get the validity bitmap of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +static inline struct ArrowBitmap* ArrowArrayValidityBitmap( + struct ArrowArray* array); + +/// \brief Get a buffer of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +static inline struct ArrowBuffer* ArrowArrayBuffer( + struct ArrowArray* array, int64_t i); + +/// \brief Start element-wise appending to an ArrowArray +/// +/// Initializes any values needed to use ArrowArrayAppend*() functions. +/// All element-wise appenders append by value and return EINVAL if the exact +/// value cannot be represented by the underlying storage type. array must have +/// been allocated using ArrowArrayInitFromType() +static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array); + +/// \brief Reserve space for future appends +/// +/// For buffer sizes that can be calculated (i.e., not string data buffers or +/// child array sizes for non-fixed-size arrays), recursively reserve space for +/// additional elements. This is useful for reducing the number of reallocations +/// that occur using the item-wise appenders. +ArrowErrorCode ArrowArrayReserve( + struct ArrowArray* array, int64_t additional_size_elements); + +/// \brief Append a null value to an array +static inline ArrowErrorCode ArrowArrayAppendNull( + struct ArrowArray* array, int64_t n); + +/// \brief Append an empty, non-null value to an array +static inline ArrowErrorCode ArrowArrayAppendEmpty( + struct ArrowArray* array, int64_t n); + +/// \brief Append a signed integer value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range). +static inline ArrowErrorCode ArrowArrayAppendInt( + struct ArrowArray* array, int64_t value); + +/// \brief Append an unsigned integer value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range). +static inline ArrowErrorCode ArrowArrayAppendUInt( + struct ArrowArray* array, uint64_t value); + +/// \brief Append a double value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range or there is an attempt to append +/// a non-integer to an array with an integer storage type). +static inline ArrowErrorCode ArrowArrayAppendDouble( + struct ArrowArray* array, double value); + +/// \brief Append a string of bytes to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is +/// not a binary, string, large binary, large string, or fixed-size binary +/// array, or value is the wrong size for a fixed-size binary array). +static inline ArrowErrorCode ArrowArrayAppendBytes( + struct ArrowArray* array, struct ArrowBufferView value); + +/// \brief Append a string value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is +/// not a string or large string array). +static inline ArrowErrorCode ArrowArrayAppendString( + struct ArrowArray* array, struct ArrowStringView value); + +/// \brief Append a Interval to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendInterval( + struct ArrowArray* array, const struct ArrowInterval* value); + +/// \brief Append a decimal value to an array +/// +/// Returns NANOARROW_OK if array is a decimal array with the appropriate +/// bitwidth or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendDecimal( + struct ArrowArray* array, const struct ArrowDecimal* value); + +/// \brief Finish a nested array element +/// +/// Appends a non-null element to the array based on the first child's current +/// length. Returns NANOARROW_OK if the item was successfully added, EOVERFLOW +/// if the child of a list or map array would exceed INT_MAX elements, or EINVAL +/// if the underlying storage type is not a struct, list, large list, or +/// fixed-size list, or if there was an attempt to add a struct or fixed-size +/// list element where the length of the child array(s) did not match the +/// expected length. +static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array); + +/// \brief Finish a union array element +/// +/// Appends an element to the union type ids buffer and increments +/// array->length. For sparse unions, up to one element is added to non type-id +/// children. Returns EINVAL if the underlying storage type is not a union, if +/// type_id is not valid, or if child sizes after appending are inconsistent. +static inline ArrowErrorCode ArrowArrayFinishUnionElement( + struct ArrowArray* array, int8_t type_id); + +/// \brief Shrink buffer capacity to the size required +/// +/// Also applies shrinking to any child arrays. array must have been allocated +/// using ArrowArrayInitFromType +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array); + +/// \brief Finish building an ArrowArray +/// +/// Flushes any pointers from internal buffers that may have been reallocated +/// into array->buffers and checks the actual size of the buffers +/// against the expected size based on the final length. +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArrayFinishBuildingDefault( + struct ArrowArray* array, struct ArrowError* error); + +/// \brief Finish building an ArrowArray with explicit validation +/// +/// Finish building with an explicit validation level. This could perform less +/// validation (i.e. NANOARROW_VALIDATION_LEVEL_NONE or +/// NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU buffer data access is not +/// possible or more validation (i.e., NANOARROW_VALIDATION_LEVEL_FULL) if +/// buffer content was obtained from an untrusted or corruptible source. +ArrowErrorCode ArrowArrayFinishBuilding( + struct ArrowArray* array, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-array-view Reading arrays +/// +/// These functions read and validate the contents ArrowArray structures. +/// +/// @{ + +/// \brief Initialize the contents of an ArrowArrayView +void ArrowArrayViewInitFromType( + struct ArrowArrayView* array_view, enum ArrowType storage_type); + +/// \brief Move an ArrowArrayView +/// +/// Transfers the ArrowArrayView data and lifecycle management to another +/// address and resets the contents of src. +static inline void ArrowArrayViewMove( + struct ArrowArrayView* src, struct ArrowArrayView* dst); + +/// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema +ArrowErrorCode ArrowArrayViewInitFromSchema( + struct ArrowArrayView* array_view, + const struct ArrowSchema* schema, + struct ArrowError* error); + +/// \brief Allocate the array_view->children array +/// +/// Includes the memory for each child struct ArrowArrayView +ArrowErrorCode ArrowArrayViewAllocateChildren( + struct ArrowArrayView* array_view, int64_t n_children); + +/// \brief Allocate array_view->dictionary +ArrowErrorCode ArrowArrayViewAllocateDictionary( + struct ArrowArrayView* array_view); + +/// \brief Set data-independent buffer sizes from length +void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); + +/// \brief Set buffer sizes and data pointers from an ArrowArray +ArrowErrorCode ArrowArrayViewSetArray( + struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Set buffer sizes and data pointers from an ArrowArray except for +/// those that require dereferencing buffer content. +ArrowErrorCode ArrowArrayViewSetArrayMinimal( + struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Performs checks on the content of an ArrowArrayView +/// +/// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray, +/// the buffer sizes and some content (fist and last offset) have already +/// been validated at the "default" level. If setting the buffer pointers +/// and sizes otherwise, you may wish to perform checks at a different level. +/// See documentation for ArrowValidationLevel for the details of checks +/// performed at each level. +ArrowErrorCode ArrowArrayViewValidate( + struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); + +/// \brief Reset the contents of an ArrowArrayView and frees resources +void ArrowArrayViewReset(struct ArrowArrayView* array_view); + +/// \brief Check for a null element in an ArrowArrayView +static inline int8_t ArrowArrayViewIsNull( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the type id of a union array element +static inline int8_t ArrowArrayViewUnionTypeId( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the child index of a union array element +static inline int8_t ArrowArrayViewUnionChildIndex( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the index to use into the relevant union child array +static inline int64_t ArrowArrayViewUnionChildOffset( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an integer +/// +/// This function does not check for null values, that values are actually +/// integers, or that values are within a valid range for an int64. +static inline int64_t ArrowArrayViewGetIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an unsigned integer +/// +/// This function does not check for null values, that values are actually +/// integers, or that values are within a valid range for a uint64. +static inline uint64_t ArrowArrayViewGetUIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as a double +/// +/// This function does not check for null values, or +/// that values are within a valid range for a double. +static inline double ArrowArrayViewGetDoubleUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowStringView +/// +/// This function does not check for null values. +static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowBufferView +/// +/// This function does not check for null values. +static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowDecimal +/// +/// This function does not check for null values. The out parameter must +/// be initialized with ArrowDecimalInit() with the proper parameters for this +/// type before calling this for the first time. +static inline void ArrowArrayViewGetDecimalUnsafe( + const struct ArrowArrayView* array_view, + int64_t i, + struct ArrowDecimal* out); + +/// @} + +/// \defgroup nanoarrow-basic-array-stream Basic ArrowArrayStream implementation +/// +/// An implementation of an ArrowArrayStream based on a collection of +/// zero or more previously-existing ArrowArray objects. Users should +/// initialize and/or validate the contents before transferring the +/// responsibility of the ArrowArrayStream elsewhere. +/// +/// @{ + +/// \brief Initialize an ArrowArrayStream backed by this implementation +/// +/// This function moves the ownership of schema to the array_stream. If +/// this function returns NANOARROW_OK, the caller is responsible for +/// releasing the ArrowArrayStream. +ArrowErrorCode ArrowBasicArrayStreamInit( + struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema, + int64_t n_arrays); + +/// \brief Set the ith ArrowArray in this ArrowArrayStream. +/// +/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). +/// This function move the ownership of array to the array_stream. i must +/// be greater than zero and less than the value of n_arrays passed in +/// ArrowBasicArrayStreamInit(). Callers are not required to fill all +/// n_arrays members (i.e., n_arrays is a maximum bound). +void ArrowBasicArrayStreamSetArray( + struct ArrowArrayStream* array_stream, int64_t i, struct ArrowArray* array); + +/// \brief Validate the contents of this ArrowArrayStream +/// +/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). +/// This function uses ArrowArrayStreamInitFromSchema() and +/// ArrowArrayStreamSetArray() to validate the contents of the arrays. +ArrowErrorCode ArrowBasicArrayStreamValidate( + const struct ArrowArrayStream* array_stream, struct ArrowError* error); + +/// @} + +// Undefine ArrowErrorCode, which may have been defined to annotate functions +// that return it to warn for an unused result. +#if defined(ArrowErrorCode) +#undef ArrowErrorCode +#endif + +// Inline function definitions + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_BUFFER_INLINE_H_INCLUDED +#define NANOARROW_BUFFER_INLINE_H_INCLUDED + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static inline int64_t _ArrowGrowByFactor( + int64_t current_capacity, int64_t new_capacity) { + int64_t doubled_capacity = current_capacity * 2; + if (doubled_capacity > new_capacity) { + return doubled_capacity; + } else { + return new_capacity; + } +} + +static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { + buffer->data = NULL; + buffer->size_bytes = 0; + buffer->capacity_bytes = 0; + buffer->allocator = ArrowBufferAllocatorDefault(); +} + +static inline ArrowErrorCode ArrowBufferSetAllocator( + struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) { + if (buffer->data == NULL) { + buffer->allocator = allocator; + return NANOARROW_OK; + } else { + return EINVAL; + } +} + +static inline void ArrowBufferReset(struct ArrowBuffer* buffer) { + if (buffer->data != NULL) { + buffer->allocator.free( + &buffer->allocator, (uint8_t*)buffer->data, buffer->capacity_bytes); + buffer->data = NULL; + } + + buffer->capacity_bytes = 0; + buffer->size_bytes = 0; +} + +static inline void ArrowBufferMove( + struct ArrowBuffer* src, struct ArrowBuffer* dst) { + memcpy(dst, src, sizeof(struct ArrowBuffer)); + src->data = NULL; + ArrowBufferReset(src); +} + +static inline ArrowErrorCode ArrowBufferResize( + struct ArrowBuffer* buffer, + int64_t new_capacity_bytes, + char shrink_to_fit) { + if (new_capacity_bytes < 0) { + return EINVAL; + } + + if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) { + buffer->data = buffer->allocator.reallocate( + &buffer->allocator, + buffer->data, + buffer->capacity_bytes, + new_capacity_bytes); + if (buffer->data == NULL && new_capacity_bytes > 0) { + buffer->capacity_bytes = 0; + buffer->size_bytes = 0; + return ENOMEM; + } + + buffer->capacity_bytes = new_capacity_bytes; + } + + // Ensures that when shrinking that size <= capacity + if (new_capacity_bytes < buffer->size_bytes) { + buffer->size_bytes = new_capacity_bytes; + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBufferReserve( + struct ArrowBuffer* buffer, int64_t additional_size_bytes) { + int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes; + if (min_capacity_bytes <= buffer->capacity_bytes) { + return NANOARROW_OK; + } + + return ArrowBufferResize( + buffer, + _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), + 0); +} + +static inline void ArrowBufferAppendUnsafe( + struct ArrowBuffer* buffer, const void* data, int64_t size_bytes) { + if (size_bytes > 0) { + memcpy(buffer->data + buffer->size_bytes, data, size_bytes); + buffer->size_bytes += size_bytes; + } +} + +static inline ArrowErrorCode ArrowBufferAppend( + struct ArrowBuffer* buffer, const void* data, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + + ArrowBufferAppendUnsafe(buffer, data, size_bytes); + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBufferAppendInt8( + struct ArrowBuffer* buffer, int8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int8_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt8( + struct ArrowBuffer* buffer, uint8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint8_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt16( + struct ArrowBuffer* buffer, int16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int16_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt16( + struct ArrowBuffer* buffer, uint16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt32( + struct ArrowBuffer* buffer, int32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int32_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt32( + struct ArrowBuffer* buffer, uint32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt64( + struct ArrowBuffer* buffer, int64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt64( + struct ArrowBuffer* buffer, uint64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendDouble( + struct ArrowBuffer* buffer, double value) { + return ArrowBufferAppend(buffer, &value, sizeof(double)); +} + +static inline ArrowErrorCode ArrowBufferAppendFloat( + struct ArrowBuffer* buffer, float value) { + return ArrowBufferAppend(buffer, &value, sizeof(float)); +} + +static inline ArrowErrorCode ArrowBufferAppendStringView( + struct ArrowBuffer* buffer, struct ArrowStringView value) { + return ArrowBufferAppend(buffer, value.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendBufferView( + struct ArrowBuffer* buffer, struct ArrowBufferView value) { + return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendFill( + struct ArrowBuffer* buffer, uint8_t value, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + + memset(buffer->data + buffer->size_bytes, value, size_bytes); + buffer->size_bytes += size_bytes; + return NANOARROW_OK; +} + +static const uint8_t _ArrowkBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; +static const uint8_t _ArrowkFlippedBitmask[] = { + 254, 253, 251, 247, 239, 223, 191, 127}; +static const uint8_t _ArrowkPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; +static const uint8_t _ArrowkTrailingBitmask[] = { + 255, 254, 252, 248, 240, 224, 192, 128}; + +static const uint8_t _ArrowkBytePopcount[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +static inline int64_t _ArrowRoundUpToMultipleOf8(int64_t value) { + return (value + 7) & ~((int64_t)7); +} + +static inline int64_t _ArrowRoundDownToMultipleOf8(int64_t value) { + return (value / 8) * 8; +} + +static inline int64_t _ArrowBytesForBits(int64_t bits) { + return (bits >> 3) + ((bits & 7) != 0); +} + +static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; +} + +static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; +} + +static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { + *out = + (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); +} + +static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { + *out = + (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); +} + +static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { + return (bits[i >> 3] >> (i & 0x07)) & 1; +} + +static inline void ArrowBitsUnpackInt8( + const uint8_t* bits, int64_t start_offset, int64_t length, int8_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt8(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + +static inline void ArrowBitsUnpackInt32( + const uint8_t* bits, int64_t start_offset, int64_t length, int32_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt32(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + +static inline void ArrowBitSet(uint8_t* bits, int64_t i) { + bits[i / 8] |= _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitClear(uint8_t* bits, int64_t i) { + bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; +} + +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { + bits[i / 8] ^= ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & + _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitsSetTo( + uint8_t* bits, int64_t start_offset, int64_t length, uint8_t bits_are_set) { + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const uint8_t fill_byte = (uint8_t)(-bits_are_set); + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_end = i_end / 8 + 1; + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; + + if (bytes_end == bytes_begin + 1) { + // set bits within a single byte + const uint8_t + only_byte_mask = i_end % 8 == 0 ? + first_byte_mask : + (uint8_t)(first_byte_mask | last_byte_mask); + bits[bytes_begin] &= only_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte + bits[bytes_begin] &= first_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { + // set/clear whole bytes + memset( + bits + bytes_begin + 1, + fill_byte, + (size_t)(bytes_end - bytes_begin - 2)); + } + + if (i_end % 8 == 0) { + return; + } + + // set/clear leading bits of last byte + bits[bytes_end - 1] &= last_byte_mask; + bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); +} + +static inline int64_t ArrowBitCountSet( + const uint8_t* bits, int64_t start_offset, int64_t length) { + if (length == 0) { + return 0; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + // count bits within a single byte + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; + + const uint8_t + only_byte_mask = i_end % 8 == 0 ? + last_byte_mask : + (uint8_t)(first_byte_mask & last_byte_mask); + + const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; + return _ArrowkBytePopcount[byte_masked]; + } + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = i_end % 8 == 0 ? + 0 : + _ArrowkTrailingBitmask[i_end % 8]; + int64_t count = 0; + + // first byte + count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + count += _ArrowkBytePopcount[bits[i]]; + } + + // last byte + count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; + + return count; +} + +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap) { + ArrowBufferInit(&bitmap->buffer); + bitmap->size_bits = 0; +} + +static inline void ArrowBitmapMove( + struct ArrowBitmap* src, struct ArrowBitmap* dst) { + ArrowBufferMove(&src->buffer, &dst->buffer); + dst->size_bits = src->size_bits; + src->size_bits = 0; +} + +static inline ArrowErrorCode ArrowBitmapReserve( + struct ArrowBitmap* bitmap, int64_t additional_size_bits) { + int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; + if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { + return NANOARROW_OK; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve( + &bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); + + bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBitmapResize( + struct ArrowBitmap* bitmap, int64_t new_capacity_bits, char shrink_to_fit) { + if (new_capacity_bits < 0) { + return EINVAL; + } + + int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); + NANOARROW_RETURN_NOT_OK( + ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); + + if (new_capacity_bits < bitmap->size_bits) { + bitmap->size_bits = new_capacity_bits; + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBitmapAppend( + struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); + + ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); + return NANOARROW_OK; +} + +static inline void ArrowBitmapAppendUnsafe( + struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length) { + ArrowBitsSetTo( + bitmap->buffer.data, bitmap->size_bits, length, bits_are_set); + bitmap->size_bits += length; + bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); +} + +static inline void ArrowBitmapAppendInt8Unsafe( + struct ArrowBitmap* bitmap, const int8_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } + + const int8_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - + out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt8(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo( + bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapAppendInt32Unsafe( + struct ArrowBitmap* bitmap, const int32_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } + + const int32_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - + out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo( + bitmap->buffer.data, out_i_cursor++, (uint8_t)values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt32(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo( + bitmap->buffer.data, out_i_cursor++, (uint8_t)values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { + ArrowBufferReset(&bitmap->buffer); + bitmap->size_bits = 0; +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_ARRAY_INLINE_H_INCLUDED +#define NANOARROW_ARRAY_INLINE_H_INCLUDED + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static inline struct ArrowBitmap* ArrowArrayValidityBitmap( + struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + return &private_data->bitmap; +} + +static inline struct ArrowBuffer* ArrowArrayBuffer( + struct ArrowArray* array, int64_t i) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + switch (i) { + case 0: + return &private_data->bitmap.buffer; + default: + return private_data->buffers + i - 1; + } +} + +// We don't currently support the case of unions where type_id != child_index; +// however, these functions are used to keep track of where that assumption +// is made. +static inline int8_t _ArrowArrayUnionChildIndex( + struct ArrowArray* array, int8_t type_id) { + NANOARROW_UNUSED(array); + return type_id; +} + +static inline int8_t _ArrowArrayUnionTypeId( + struct ArrowArray* array, int8_t child_index) { + NANOARROW_UNUSED(array); + return child_index; +} + +static inline int32_t _ArrowParseUnionTypeIds( + const char* type_ids, int8_t* out) { + if (*type_ids == '\0') { + return 0; + } + + int32_t i = 0; + long type_id; + char* end_ptr; + do { + type_id = strtol(type_ids, &end_ptr, 10); + if (end_ptr == type_ids || type_id < 0 || type_id > 127) { + return -1; + } + + if (out != NULL) { + out[i] = (int8_t)type_id; + } + + i++; + + type_ids = end_ptr; + if (*type_ids == '\0') { + return i; + } else if (*type_ids != ',') { + return -1; + } else { + type_ids++; + } + } while (1); + + return -1; +} + +static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices( + const int8_t* type_ids, int64_t n_type_ids, int64_t n_children) { + if (n_type_ids != n_children) { + return 0; + } + + for (int8_t i = 0; i < n_type_ids; i++) { + if (type_ids[i] != i) { + return 0; + } + } + + return 1; +} + +static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices( + const char* type_id_str, int64_t n_children) { + int8_t type_ids[128]; + int32_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); + return _ArrowParsedUnionTypeIdsWillEqualChildIndices( + type_ids, n_type_ids, n_children); +} + +static inline ArrowErrorCode ArrowArrayStartAppending( + struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + return EINVAL; + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + // Note that this value could be -1 if the type_ids string was + // invalid + if (private_data->union_type_id_is_child_index != 1) { + return EINVAL; + } else { + break; + } + default: + break; + } + if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { + return EINVAL; + } + + // Initialize any data offset buffer with a single zero + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + if (private_data->layout.buffer_type[i] == + NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 64) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); + } else if ( + private_data->layout.buffer_type[i] == + NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 32) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); + } + } + + // Start building any child arrays or dictionaries + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + NANOARROW_RETURN_NOT_OK( + ArrowBufferResize(buffer, buffer->size_bytes, 1)); + } + + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendBits( + struct ArrowArray* array, int64_t buffer_i, uint8_t value, int64_t n) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); + int64_t bytes_required = _ArrowRoundUpToMultipleOf8( + private_data->layout + .element_size_bits[buffer_i] * + (array->length + 1)) / + 8; + if (bytes_required > buffer->size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill( + buffer, 0, bytes_required - buffer->size_bytes)); + } + + ArrowBitsSetTo(buffer->data, array->length, n, value); + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal( + struct ArrowArray* array, int64_t n, uint8_t is_valid) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + if (n == 0) { + return NANOARROW_OK; + } + + // Some type-specific handling + switch (private_data->storage_type) { + case NANOARROW_TYPE_NA: + // (An empty value for a null array *is* a null) + array->null_count += n; + array->length += n; + return NANOARROW_OK; + + case NANOARROW_TYPE_DENSE_UNION: { + // Add one null to the first child and append n references to that + // child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendEmptyInternal( + array->children[0], 1, is_valid)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + for (int64_t i = 0; i < n; i++) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), + (int32_t)array->children[0]->length - 1)); + } + // For the purposes of array->null_count, union elements are never + // considered "null" even if some children contain nulls. + array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_SPARSE_UNION: { + // Add n nulls to the first child and append n references to that + // child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendEmptyInternal( + array->children[0], n, is_valid)); + for (int64_t i = 1; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayAppendEmpty(array->children[i], n)); + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + // For the purposes of array->null_count, union elements are never + // considered "null" even if some children contain nulls. + array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( + array->children[0], + n * private_data->layout.child_size_elements)); + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayAppendEmpty(array->children[i], n)); + } + break; + + default: + break; + } + + // Append n is_valid bits to the validity bitmap. If we haven't allocated a + // bitmap yet and we need to append nulls, do it now. + if (!is_valid && private_data->bitmap.buffer.data == NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapReserve(&private_data->bitmap, array->length + n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } else if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } + + // Add appropriate buffer fill + struct ArrowBuffer* buffer; + int64_t size_bytes; + + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + buffer = ArrowArrayBuffer(array, i); + size_bytes = private_data->layout.element_size_bits[i] / 8; + + switch (private_data->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_NONE: + case NANOARROW_BUFFER_TYPE_VALIDITY: + continue; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Append the current value at the end of the offset buffer for + // each element + NANOARROW_RETURN_NOT_OK( + ArrowBufferReserve(buffer, size_bytes * n)); + + for (int64_t j = 0; j < n; j++) { + ArrowBufferAppendUnsafe( + buffer, + buffer->data + size_bytes * (array->length + j), + size_bytes); + } + + // Skip the data buffer + i++; + continue; + case NANOARROW_BUFFER_TYPE_DATA: + // Zero out the next bit of memory + if (private_data->layout.element_size_bits[i] % 8 == 0) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(buffer, 0, size_bytes * n)); + } else { + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendBits(array, i, 0, n)); + } + continue; + + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + // These cases return above + return EINVAL; + } + } + + array->length += n; + array->null_count += n * !is_valid; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendNull( + struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 0); +} + +static inline ArrowErrorCode ArrowArrayAppendEmpty( + struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 1); +} + +static inline ArrowErrorCode ArrowArrayAppendInt( + struct ArrowArray* array, int64_t value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INT64: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); + break; + case NANOARROW_TYPE_INT32: + _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, (int32_t)value)); + break; + case NANOARROW_TYPE_INT16: + _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt16(data_buffer, (int16_t)value)); + break; + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_RANGE(value, INT8_MIN, INT8_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt8(data_buffer, (int8_t)value)); + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); + return ArrowArrayAppendUInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendUInt( + struct ArrowArray* array, uint64_t value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UINT64: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); + break; + case NANOARROW_TYPE_UINT32: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); + break; + case NANOARROW_TYPE_UINT16: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); + break; + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); + break; + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); + return ArrowArrayAppendInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDouble( + struct ArrowArray* array, double value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, &value, sizeof(double))); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendBytes( + struct ArrowArray* array, struct ArrowBufferView value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); + struct ArrowBuffer* data_buffer = ArrowArrayBuffer( + array, + 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); + int32_t offset; + int64_t large_offset; + int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + offset = ((int32_t*)offset_buffer->data)[array->length]; + if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { + return EOVERFLOW; + } + + offset += (int32_t)value.size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + large_offset = ((int64_t*)offset_buffer->data)[array->length]; + large_offset += value.size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + offset_buffer, &large_offset, sizeof(int64_t))); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (value.size_bytes != fixed_size_bytes) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value.data.data, value.size_bytes)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendString( + struct ArrowArray* array, struct ArrowStringView value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBufferView buffer_view; + buffer_view.data.data = value.data; + buffer_view.size_bytes = value.size_bytes; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + return ArrowArrayAppendBytes(array, buffer_view); + default: + return EINVAL; + } +} + +static inline ArrowErrorCode ArrowArrayAppendInterval( + struct ArrowArray* array, const struct ArrowInterval* value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->months)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->ms)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->months)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(data_buffer, value->ns)); + break; + } + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDecimal( + struct ArrowArray* array, const struct ArrowDecimal* value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + if (value->n_words != 2) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value->words, 2 * sizeof(uint64_t))); + break; + } + case NANOARROW_TYPE_DECIMAL256: + if (value->n_words != 4) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value->words, 4 * sizeof(uint64_t))); + break; + } + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + int64_t child_length; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + child_length = array->children[0]->length; + if (child_length > INT32_MAX) { + return EOVERFLOW; + } + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), (int32_t)child_length)); + break; + case NANOARROW_TYPE_LARGE_LIST: + child_length = array->children[0]->length; + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64( + ArrowArrayBuffer(array, 1), child_length)); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_length = array->children[0]->length; + if (child_length != ((array->length + 1) * + private_data->layout.child_size_elements)) { + return EINVAL; + } + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + child_length = array->children[i]->length; + if (child_length != (array->length + 1)) { + return EINVAL; + } + } + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishUnionElement( + struct ArrowArray* array, int8_t type_id) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); + if (child_index < 0 || child_index >= array->n_children) { + return EINVAL; + } + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + // Append the target child length to the union offsets buffer + _NANOARROW_CHECK_RANGE( + array->children[child_index]->length, 0, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), + (int32_t)array->children[child_index]->length - 1)); + break; + case NANOARROW_TYPE_SPARSE_UNION: + // Append one empty to any non-target column that isn't already the + // right length or abort if appending a null will result in a column + // with invalid length + for (int64_t i = 0; i < array->n_children; i++) { + if (i == child_index || + array->children[i]->length == (array->length + 1)) { + continue; + } + + if (array->children[i]->length != array->length) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowArrayAppendEmpty(array->children[i], 1)); + } + + break; + default: + return EINVAL; + } + + // Write to the type_ids buffer + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); + array->length++; + return NANOARROW_OK; +} + +static inline void ArrowArrayViewMove( + struct ArrowArrayView* src, struct ArrowArrayView* dst) { + memcpy(dst, src, sizeof(struct ArrowArrayView)); + ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); +} + +static inline int8_t ArrowArrayViewIsNull( + const struct ArrowArrayView* array_view, int64_t i) { + const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_NA: + return 0x01; + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + // Unions are "never null" in Arrow land + return 0x00; + default: + return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); + } +} + +static inline int8_t ArrowArrayViewUnionTypeId( + const struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + return array_view->buffer_views[0].data.as_int8[i]; + default: + return -1; + } +} + +static inline int8_t ArrowArrayViewUnionChildIndex( + const struct ArrowArrayView* array_view, int64_t i) { + int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); + if (array_view->union_type_id_map == NULL) { + return type_id; + } else { + return array_view->union_type_id_map[type_id]; + } +} + +static inline int64_t ArrowArrayViewUnionChildOffset( + const struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_SPARSE_UNION: + return i; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewListChildOffset( + const struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_LARGE_LIST: + return array_view->buffer_views[1].data.as_int64[i]; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewGetIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (int64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (int64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return INT64_MAX; + } +} + +static inline uint64_t ArrowArrayViewGetUIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (uint64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (uint64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return UINT64_MAX; + } +} + +static inline double ArrowArrayViewGetDoubleUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return (double)data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return (double)data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return DBL_MAX; + } +} + +static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const char* data_view = array_view->buffer_views[2].data.as_char; + + struct ArrowStringView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.data = data_view + offsets_view->data.as_int32[i]; + view.size_bytes = offsets_view->data.as_int32[i + 1] - + offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.data = data_view + offsets_view->data.as_int64[i]; + view.size_bytes = offsets_view->data.as_int64[i + 1] - + offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data = array_view->buffer_views[1].data.as_char + + (i * view.size_bytes); + break; + default: + view.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; + + struct ArrowBufferView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.size_bytes = offsets_view->data.as_int32[i + 1] - + offsets_view->data.as_int32[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.size_bytes = offsets_view->data.as_int64[i + 1] - + offsets_view->data.as_int64[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data.as_uint8 = array_view->buffer_views[1].data.as_uint8 + + (i * view.size_bytes); + break; + default: + view.data.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline void ArrowArrayViewGetIntervalUnsafe( + const struct ArrowArrayView* array_view, + int64_t i, + struct ArrowInterval* out) { + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + const size_t size = sizeof(int32_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + const size_t size = sizeof(int32_t) + sizeof(int32_t); + memcpy(&out->days, data_view + i * size, sizeof(int32_t)); + memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + const size_t size = sizeof(int32_t) + sizeof(int32_t) + + sizeof(int64_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); + memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); + break; + } + default: + break; + } +} + +static inline void ArrowArrayViewGetDecimalUnsafe( + const struct ArrowArrayView* array_view, + int64_t i, + struct ArrowDecimal* out) { + i += array_view->offset; + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + ArrowDecimalSetBytes(out, data_view + (i * 16)); + break; + case NANOARROW_TYPE_DECIMAL256: + ArrowDecimalSetBytes(out, data_view + (i * 32)); + break; + default: + memset(out->words, 0, sizeof(out->words)); + break; + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libtiledbsoma/src/utils/nanoarrow.hpp b/libtiledbsoma/src/utils/nanoarrow.hpp new file mode 100644 index 0000000000..8d5b841e28 --- /dev/null +++ b/libtiledbsoma/src/utils/nanoarrow.hpp @@ -0,0 +1,501 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "nanoarrow.h" + +#ifndef NANOARROW_HPP_INCLUDED +#define NANOARROW_HPP_INCLUDED + +/// \defgroup nanoarrow_hpp Nanoarrow C++ Helpers +/// +/// The utilities provided in this file are intended to support C++ users +/// of the nanoarrow C library such that C++-style resource allocation +/// and error handling can be used with nanoarrow data structures. +/// These utilities are not intended to mirror the nanoarrow C API. + +namespace nanoarrow { + +/// \defgroup nanoarrow_hpp-errors Error handling helpers +/// +/// Most functions in the C API return an ArrowErrorCode to communicate +/// possible failure. Except where documented, it is usually not safe to +/// continue after a non-zero value has been returned. While the +/// nanoarrow C++ helpers do not throw any exceptions of their own, +/// these helpers are provided to facilitate using the nanoarrow C++ helpers +/// in frameworks where this is a useful error handling idiom. +/// +/// @{ + +class Exception : public std::exception { + public: + Exception(const std::string& msg) : msg_(msg) {} + const char* what() const noexcept { return msg_.c_str(); } + + private: + std::string msg_; +}; + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_THROW_NOT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + throw nanoarrow::Exception( \ + std::string(EXPR_STR) + std::string(" failed with errno ") + \ + std::to_string(NAME) + std::string("\n * ") + std::string(__FILE__) + \ + std::string(":") + std::to_string(__LINE__) + std::string("\n")); \ + } \ + } while (0) +#else +#define _NANOARROW_THROW_NOT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + throw nanoarrow::Exception(std::string(EXPR_STR) + \ + std::string(" failed with errno ") + \ + std::to_string(NAME)); \ + } \ + } while (0) +#endif + +#define NANOARROW_THROW_NOT_OK(EXPR) \ + _NANOARROW_THROW_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, \ + #EXPR) + +/// @} + +namespace internal { + +/// \defgroup nanoarrow_hpp-unique_base Base classes for Unique wrappers +/// +/// @{ + +template +static inline void init_pointer(T* data); + +template +static inline void move_pointer(T* src, T* dst); + +template +static inline void release_pointer(T* data); + +template <> +inline void init_pointer(struct ArrowSchema* data) { + data->release = nullptr; +} + +template <> +inline void move_pointer(struct ArrowSchema* src, struct ArrowSchema* dst) { + ArrowSchemaMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowSchema* data) { + if (data->release != nullptr) { + data->release(data); + } +} + +template <> +inline void init_pointer(struct ArrowArray* data) { + data->release = nullptr; +} + +template <> +inline void move_pointer(struct ArrowArray* src, struct ArrowArray* dst) { + ArrowArrayMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowArray* data) { + if (data->release != nullptr) { + data->release(data); + } +} + +template <> +inline void init_pointer(struct ArrowArrayStream* data) { + data->release = nullptr; +} + +template <> +inline void move_pointer(struct ArrowArrayStream* src, struct ArrowArrayStream* dst) { + ArrowArrayStreamMove(src, dst); +} + +template <> +inline void release_pointer(ArrowArrayStream* data) { + if (data->release != nullptr) { + data->release(data); + } +} + +template <> +inline void init_pointer(struct ArrowBuffer* data) { + ArrowBufferInit(data); +} + +template <> +inline void move_pointer(struct ArrowBuffer* src, struct ArrowBuffer* dst) { + ArrowBufferMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowBuffer* data) { + ArrowBufferReset(data); +} + +template <> +inline void init_pointer(struct ArrowBitmap* data) { + ArrowBitmapInit(data); +} + +template <> +inline void move_pointer(struct ArrowBitmap* src, struct ArrowBitmap* dst) { + ArrowBitmapMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowBitmap* data) { + ArrowBitmapReset(data); +} + +template <> +inline void init_pointer(struct ArrowArrayView* data) { + ArrowArrayViewInitFromType(data, NANOARROW_TYPE_UNINITIALIZED); +} + +template <> +inline void move_pointer(struct ArrowArrayView* src, struct ArrowArrayView* dst) { + ArrowArrayViewMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowArrayView* data) { + ArrowArrayViewReset(data); +} + +/// \brief A unique_ptr-like base class for stack-allocatable objects +/// \tparam T The object type +template +class Unique { + public: + /// \brief Construct an invalid instance of T holding no resources + Unique() { init_pointer(&data_); } + + /// \brief Move and take ownership of data + Unique(T* data) { move_pointer(data, &data_); } + + /// \brief Move and take ownership of data wrapped by rhs + Unique(Unique&& rhs) : Unique(rhs.get()) {} + Unique& operator=(Unique&& rhs) { + reset(rhs.get()); + return *this; + } + + // These objects are not copyable + Unique(const Unique& rhs) = delete; + + /// \brief Get a pointer to the data owned by this object + T* get() noexcept { return &data_; } + const T* get() const noexcept { return &data_; } + + /// \brief Use the pointer operator to access fields of this object + T* operator->() noexcept { return &data_; } + const T* operator->() const noexcept { return &data_; } + + /// \brief Call data's release callback if valid + void reset() { release_pointer(&data_); } + + /// \brief Call data's release callback if valid and move ownership of the data + /// pointed to by data + void reset(T* data) { + reset(); + move_pointer(data, &data_); + } + + /// \brief Move ownership of this object to the data pointed to by out + void move(T* out) { move_pointer(&data_, out); } + + ~Unique() { reset(); } + + protected: + T data_; +}; + +/// @} + +} // namespace internal + +/// \defgroup nanoarrow_hpp-unique Unique object wrappers +/// +/// The Arrow C Data interface, the Arrow C Stream interface, and the +/// nanoarrow C library use stack-allocatable objects, some of which +/// require initialization or cleanup. +/// +/// @{ + +/// \brief Class wrapping a unique struct ArrowSchema +using UniqueSchema = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowArray +using UniqueArray = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowArrayStream +using UniqueArrayStream = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowBuffer +using UniqueBuffer = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowBitmap +using UniqueBitmap = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowArrayView +using UniqueArrayView = internal::Unique; + +/// @} + +/// \defgroup nanoarrow_hpp-array-stream ArrayStream helpers +/// +/// These classes provide simple ArrowArrayStream implementations that +/// can be extended to help simplify the process of creating a valid +/// ArrowArrayStream implementation or used as-is for testing. +/// +/// @{ + +/// @brief Export an ArrowArrayStream from a standard C++ class +/// @tparam T A class with methods `int GetSchema(ArrowSchema*)`, `int +/// GetNext(ArrowArray*)`, and `const char* GetLastError()` +/// +/// This class allows a standard C++ class to be exported to a generic ArrowArrayStream +/// consumer by mapping C callback invocations to method calls on an instance of the +/// object whose lifecycle is owned by the ArrowArrayStream. See VectorArrayStream for +/// minimal useful example of this pattern. +/// +/// The methods must be accessible to the ArrayStreamFactory, either as public methods or +/// by declaring ArrayStreamFactory a friend. Implementors are encouraged (but +/// not required) to implement a ToArrayStream(ArrowArrayStream*) that creates a new +/// instance owned by the ArrowArrayStream and moves the relevant data to that instance. +/// +/// An example implementation might be: +/// +/// \code +/// class StreamImpl { +/// public: +/// // Public methods (e.g., constructor) used from C++ to initialize relevant data +/// +/// // Idiomatic exporter to move data + lifecycle responsibility to an instance +/// // managed by the ArrowArrayStream callbacks +/// void ToArrayStream(struct ArrowArrayStream* out) { +/// ArrayStreamFactory::InitArrayStream(new StreamImpl(...), out); +/// } +/// +/// private: +/// // Make relevant methods available to the ArrayStreamFactory +/// friend class ArrayStreamFactory; +/// +/// // Method implementations (called from C, not normally interacted with from C++) +/// int GetSchema(struct ArrowSchema* schema) { return ENOTSUP; } +/// int GetNext(struct ArrowArray* array) { return ENOTSUP; } +/// const char* GetLastError() { nullptr; } +/// }; +/// \endcode +/// +/// An example usage might be: +/// +/// \code +/// // Call constructor and/or public methods to initialize relevant data +/// StreamImpl impl; +/// +/// // Export to ArrowArrayStream after data are finalized +/// UniqueArrayStream stream; +/// impl.ToArrayStream(stream.get()); +/// \endcode +template +class ArrayStreamFactory { + public: + /// \brief Take ownership of instance and populate callbacks of out + static void InitArrayStream(T* instance, struct ArrowArrayStream* out) { + out->get_schema = &get_schema_wrapper; + out->get_next = &get_next_wrapper; + out->get_last_error = &get_last_error_wrapper; + out->release = &release_wrapper; + out->private_data = instance; + } + + private: + static int get_schema_wrapper(struct ArrowArrayStream* stream, + struct ArrowSchema* schema) { + return reinterpret_cast(stream->private_data)->GetSchema(schema); + } + + static int get_next_wrapper(struct ArrowArrayStream* stream, struct ArrowArray* array) { + return reinterpret_cast(stream->private_data)->GetNext(array); + } + + static const char* get_last_error_wrapper(struct ArrowArrayStream* stream) { + return reinterpret_cast(stream->private_data)->GetLastError(); + } + + static void release_wrapper(struct ArrowArrayStream* stream) { + delete reinterpret_cast(stream->private_data); + stream->release = nullptr; + stream->private_data = nullptr; + } +}; + +/// \brief An empty array stream +/// +/// This class can be constructed from an struct ArrowSchema and implements a default +/// get_next() method that always marks the output ArrowArray as released. +/// +/// DEPRECATED (0.4.0): Early versions of nanoarrow allowed subclasses to override +/// get_schema(), get_next(), and get_last_error(). This functionality will be removed +/// in a future release: use the pattern documented in ArrayStreamFactory to create +/// custom ArrowArrayStream implementations. +class EmptyArrayStream { + public: + /// \brief Create an EmptyArrayStream from an ArrowSchema + /// + /// Takes ownership of schema. + EmptyArrayStream(struct ArrowSchema* schema) : schema_(schema) { + ArrowErrorInit(&error_); + } + + /// \brief Export to ArrowArrayStream + void ToArrayStream(struct ArrowArrayStream* out) { + EmptyArrayStream* impl = new EmptyArrayStream(schema_.get()); + ArrayStreamFactory::InitArrayStream(impl, out); + } + + /// \brief Create an empty UniqueArrayStream from a struct ArrowSchema + /// + /// DEPRECATED (0.4.0): Use the constructor + ToArrayStream() to export an + /// EmptyArrayStream to an ArrowArrayStream consumer. + static UniqueArrayStream MakeUnique(struct ArrowSchema* schema) { + UniqueArrayStream stream; + EmptyArrayStream(schema).ToArrayStream(stream.get()); + return stream; + } + + virtual ~EmptyArrayStream() {} + + protected: + UniqueSchema schema_; + struct ArrowError error_; + + void MakeStream(struct ArrowArrayStream* stream) { ToArrayStream(stream); } + + virtual int get_schema(struct ArrowSchema* schema) { + return ArrowSchemaDeepCopy(schema_.get(), schema); + } + + virtual int get_next(struct ArrowArray* array) { + array->release = nullptr; + return NANOARROW_OK; + } + + virtual const char* get_last_error() { return error_.message; } + + private: + friend class ArrayStreamFactory; + + int GetSchema(struct ArrowSchema* schema) { return get_schema(schema); } + + int GetNext(struct ArrowArray* array) { return get_next(array); } + + const char* GetLastError() { return get_last_error(); } +}; + +/// \brief Implementation of an ArrowArrayStream backed by a vector of UniqueArray objects +class VectorArrayStream { + public: + /// \brief Create a VectorArrayStream from an ArrowSchema + vector of UniqueArray + /// + /// Takes ownership of schema and moves arrays if possible. + VectorArrayStream(struct ArrowSchema* schema, std::vector arrays) + : offset_(0), schema_(schema), arrays_(std::move(arrays)) {} + + /// \brief Create a one-shot VectorArrayStream from an ArrowSchema + ArrowArray + /// + /// Takes ownership of schema and array. + VectorArrayStream(struct ArrowSchema* schema, struct ArrowArray* array) + : offset_(0), schema_(schema) { + arrays_.emplace_back(array); + } + + /// \brief Export to ArrowArrayStream + void ToArrayStream(struct ArrowArrayStream* out) { + VectorArrayStream* impl = new VectorArrayStream(schema_.get(), std::move(arrays_)); + ArrayStreamFactory::InitArrayStream(impl, out); + } + + /// \brief Create a UniqueArrowArrayStream from an existing array + /// + /// DEPRECATED (0.4.0): Use the constructors + ToArrayStream() to export a + /// VectorArrayStream to an ArrowArrayStream consumer. + static UniqueArrayStream MakeUnique(struct ArrowSchema* schema, + struct ArrowArray* array) { + UniqueArrayStream stream; + VectorArrayStream(schema, array).ToArrayStream(stream.get()); + return stream; + } + + /// \brief Create a UniqueArrowArrayStream from existing arrays + /// + /// DEPRECATED (0.4.0): Use the constructor + ToArrayStream() to export a + /// VectorArrayStream to an ArrowArrayStream consumer. + static UniqueArrayStream MakeUnique(struct ArrowSchema* schema, + std::vector arrays) { + UniqueArrayStream stream; + VectorArrayStream(schema, std::move(arrays)).ToArrayStream(stream.get()); + return stream; + } + + private: + int64_t offset_; + UniqueSchema schema_; + std::vector arrays_; + + friend class ArrayStreamFactory; + + int GetSchema(struct ArrowSchema* schema) { + return ArrowSchemaDeepCopy(schema_.get(), schema); + } + + int GetNext(struct ArrowArray* array) { + if (offset_ < static_cast(arrays_.size())) { + arrays_[offset_++].move(array); + } else { + array->release = nullptr; + } + + return NANOARROW_OK; + } + + const char* GetLastError() { return ""; } +}; + +/// @} + +} // namespace nanoarrow + +#endif