From fbc9f5f7749d86678b2b65f5e2f8dca895456581 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Mon, 19 Feb 2024 12:57:08 -0600 Subject: [PATCH 01/39] Update nanoarrow vendored files to nanoarrow 0.4.0 --- apis/r/src/nanoarrow.c | 766 ++++++++++++++++++++----------- apis/r/src/nanoarrow.h | 925 +++++++++++++++++++++++++++++++------- apis/r/src/rutilities.cpp | 4 +- 3 files changed, 1278 insertions(+), 417 deletions(-) diff --git a/apis/r/src/nanoarrow.c b/apis/r/src/nanoarrow.c index 1d31884b19..d9a8d7d905 100644 --- a/apis/r/src/nanoarrow.c +++ b/apis/r/src/nanoarrow.c @@ -28,7 +28,7 @@ const char* ArrowNanoarrowVersion(void) { return NANOARROW_VERSION; } int ArrowNanoarrowVersionInt(void) { return NANOARROW_VERSION_INT; } -int ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { +ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { if (error == NULL) { return NANOARROW_OK; } @@ -49,12 +49,13 @@ int ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { } } -const char* ArrowErrorMessage(struct ArrowError* error) { return error->message; } - void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_VALIDITY; - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = NANOARROW_TYPE_BOOL; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = storage_type; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[2] = NANOARROW_TYPE_UNINITIALIZED; layout->element_size_bits[0] = 1; layout->element_size_bits[1] = 0; @@ -66,43 +67,53 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; layout->element_size_bits[0] = 0; break; case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_MAP: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_LARGE_LIST: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; layout->element_size_bits[1] = 64; break; + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + case NANOARROW_TYPE_BOOL: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 1; break; case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 8; break; case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_HALF_FLOAT: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 16; break; case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_FLOAT: + layout->element_size_bits[1] = 32; + break; case NANOARROW_TYPE_INTERVAL_MONTHS: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; @@ -110,49 +121,61 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_INTERVAL_DAY_TIME: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 64; break; case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 128; break; case NANOARROW_TYPE_DECIMAL256: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->element_size_bits[1] = 256; break; case NANOARROW_TYPE_FIXED_SIZE_BINARY: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY; break; case NANOARROW_TYPE_DENSE_UNION: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; layout->element_size_bits[0] = 8; layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_UNION_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_SPARSE_UNION: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; break; case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = storage_type; break; case NANOARROW_TYPE_LARGE_STRING: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_STRING; + break; case NANOARROW_TYPE_LARGE_BINARY: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; layout->element_size_bits[1] = 64; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; break; default: @@ -169,11 +192,15 @@ void ArrowFree(void* ptr) { free(ptr); } static uint8_t* ArrowBufferAllocatorMallocReallocate( struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, int64_t new_size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(old_size); return (uint8_t*)ArrowRealloc(ptr, new_size); } static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(size); ArrowFree(ptr); } @@ -187,6 +214,10 @@ struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { static uint8_t* ArrowBufferAllocatorNeverReallocate( struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, int64_t new_size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(ptr); + NANOARROW_UNUSED(old_size); + NANOARROW_UNUSED(new_size); return NULL; } @@ -224,7 +255,7 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( #include "nanoarrow.h" -static void ArrowSchemaRelease(struct ArrowSchema* schema) { +static void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { if (schema->format != NULL) ArrowFree((void*)schema->format); if (schema->name != NULL) ArrowFree((void*)schema->name); if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); @@ -236,7 +267,7 @@ static void ArrowSchemaRelease(struct ArrowSchema* schema) { for (int64_t i = 0; i < schema->n_children; i++) { if (schema->children[i] != NULL) { if (schema->children[i]->release != NULL) { - schema->children[i]->release(schema->children[i]); + ArrowSchemaRelease(schema->children[i]); } ArrowFree(schema->children[i]); @@ -251,7 +282,7 @@ static void ArrowSchemaRelease(struct ArrowSchema* schema) { // release() callback. if (schema->dictionary != NULL) { if (schema->dictionary->release != NULL) { - schema->dictionary->release(schema->dictionary); + ArrowSchemaRelease(schema->dictionary); } ArrowFree(schema->dictionary); @@ -265,7 +296,8 @@ static void ArrowSchemaRelease(struct ArrowSchema* schema) { schema->release = NULL; } -static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { +// -- changed for tiledb-r static +const char* ArrowSchemaFormatTemplate(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_UNINITIALIZED: return NULL; @@ -332,7 +364,8 @@ static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { } } -static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, +// -- changed for tiledb-r static +int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, enum ArrowType type) { switch (type) { case NANOARROW_TYPE_LIST: @@ -373,7 +406,7 @@ void ArrowSchemaInit(struct ArrowSchema* schema) { schema->children = NULL; schema->dictionary = NULL; schema->private_data = NULL; - schema->release = &ArrowSchemaRelease; + schema->release = &ArrowSchemaReleaseInternal; } ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) { @@ -409,7 +442,7 @@ ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowTyp int result = ArrowSchemaSetType(schema, type); if (result != NANOARROW_OK) { - schema->release(schema); + ArrowSchemaRelease(schema); return result; } @@ -685,13 +718,13 @@ ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) { return NANOARROW_OK; } -ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, +ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, struct ArrowSchema* schema_out) { ArrowSchemaInit(schema_out); int result = ArrowSchemaSetFormat(schema_out, schema->format); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } @@ -699,26 +732,26 @@ ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, result = ArrowSchemaSetName(schema_out, schema->name); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaSetMetadata(schema_out, schema->metadata); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaAllocateChildren(schema_out, schema->n_children); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } for (int64_t i = 0; i < schema->n_children; i++) { result = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } } @@ -726,13 +759,13 @@ ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, if (schema->dictionary != NULL) { result = ArrowSchemaAllocateDictionary(schema_out); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary); if (result != NANOARROW_OK) { - schema_out->release(schema_out); + ArrowSchemaRelease(schema_out); return result; } } @@ -814,8 +847,7 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, // decimal case 'd': if (format[1] != ':' || format[2] == '\0') { - ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'", - format + 3); + ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'"); return EINVAL; } @@ -1160,13 +1192,15 @@ static ArrowErrorCode ArrowSchemaViewValidateNChildren( for (int64_t i = 0; i < schema_view->schema->n_children; i++) { child = schema_view->schema->children[i]; if (child == NULL) { - ArrowErrorSet(error, "Expected valid schema at schema->children[%d] but found NULL", - i); + ArrowErrorSet(error, + "Expected valid schema at schema->children[%ld] but found NULL", + (long)i); return EINVAL; } else if (child->release == NULL) { ArrowErrorSet( error, - "Expected valid schema at schema->children[%d] but found a released schema", i); + "Expected valid schema at schema->children[%ld] but found a released schema", + (long)i); return EINVAL; } } @@ -1305,7 +1339,8 @@ static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_vie } ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, - struct ArrowSchema* schema, struct ArrowError* error) { + const struct ArrowSchema* schema, + struct ArrowError* error) { if (schema == NULL) { ArrowErrorSet(error, "Expected non-NULL schema"); return EINVAL; @@ -1333,8 +1368,7 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, } const char* format_end_out; - ArrowErrorCode result = - ArrowSchemaViewParse(schema_view, format, &format_end_out, error); + int result = ArrowSchemaViewParse(schema_view, format, &format_end_out, error); if (result != NANOARROW_OK) { if (error != NULL) { @@ -1377,10 +1411,12 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, schema_view->extension_name = ArrowCharView(NULL); schema_view->extension_metadata = ArrowCharView(NULL); - ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:name"), - &schema_view->extension_name); - ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:metadata"), - &schema_view->extension_metadata); + NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, + ArrowCharView("ARROW:extension:name"), + &schema_view->extension_name)); + NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, + ArrowCharView("ARROW:extension:metadata"), + &schema_view->extension_metadata)); return NANOARROW_OK; } @@ -1413,7 +1449,7 @@ static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_vi } } -// Helper for bookeeping to emulate sprintf()-like behaviour spread +// Helper for bookkeeping to emulate sprintf()-like behaviour spread // among multiple sprintf calls. static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, int64_t* n_remaining, int64_t* n_chars) { @@ -1431,7 +1467,7 @@ static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, } } -int64_t ArrowSchemaToString(struct ArrowSchema* schema, char* out, int64_t n, +int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, char recursive) { if (schema == NULL) { return snprintf(out, n, "[invalid: pointer is null]"); @@ -1568,7 +1604,9 @@ int64_t ArrowMetadataSizeOf(const char* metadata) { struct ArrowMetadataReader reader; struct ArrowStringView key; struct ArrowStringView value; - ArrowMetadataReaderInit(&reader, metadata); + if (ArrowMetadataReaderInit(&reader, metadata) != NANOARROW_OK) { + return 0; + } int64_t size = sizeof(int32_t); while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) { @@ -1584,7 +1622,7 @@ static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata, struct ArrowMetadataReader reader; struct ArrowStringView existing_key; struct ArrowStringView existing_value; - ArrowMetadataReaderInit(&reader, metadata); + NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) == NANOARROW_OK) { @@ -1611,7 +1649,10 @@ ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringVie char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) { struct ArrowStringView value = ArrowCharView(NULL); - ArrowMetadataGetValue(metadata, key, &value); + if (ArrowMetadataGetValue(metadata, key, &value) != NANOARROW_OK) { + return 0; + } + return value.data != NULL; } @@ -1749,7 +1790,7 @@ ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, #include "nanoarrow.h" // -- changed for tiledb-r static -void ArrowArrayRelease(struct ArrowArray* array) { +void ArrowArrayReleaseInternal(struct ArrowArray* array) { // Release buffers held by this array struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; @@ -1767,7 +1808,7 @@ void ArrowArrayRelease(struct ArrowArray* array) { for (int64_t i = 0; i < array->n_children; i++) { if (array->children[i] != NULL) { if (array->children[i]->release != NULL) { - array->children[i]->release(array->children[i]); + ArrowArrayRelease(array->children[i]); } ArrowFree(array->children[i]); @@ -1782,7 +1823,7 @@ void ArrowArrayRelease(struct ArrowArray* array) { // release() callback. if (array->dictionary != NULL) { if (array->dictionary->release != NULL) { - array->dictionary->release(array->dictionary); + ArrowArrayRelease(array->dictionary); } ArrowFree(array->dictionary); @@ -1794,7 +1835,7 @@ void ArrowArrayRelease(struct ArrowArray* array) { // -- changed for tiledb-r static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, - enum ArrowType storage_type) { + enum ArrowType storage_type) { switch (storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: @@ -1861,7 +1902,7 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, array->buffers = NULL; array->children = NULL; array->dictionary = NULL; - array->release = &ArrowArrayRelease; + array->release = &ArrowArrayReleaseInternal; array->private_data = NULL; struct ArrowArrayPrivateData* private_data = @@ -1883,7 +1924,7 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, int result = ArrowArraySetStorageType(array, storage_type); if (result != NANOARROW_OK) { - array->release(array); + ArrowArrayRelease(array); return result; } @@ -1894,26 +1935,45 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, return NANOARROW_OK; } -static ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, - struct ArrowArrayView* array_view, - struct ArrowError* error) { - ArrowArrayInitFromType(array, array_view->storage_type); +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + const struct ArrowArrayView* array_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayInitFromType(array, array_view->storage_type), error); + int result; + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; + private_data->layout = array_view->layout; - int result = ArrowArrayAllocateChildren(array, array_view->n_children); - if (result != NANOARROW_OK) { - array->release(array); - return result; + if (array_view->n_children > 0) { + result = ArrowArrayAllocateChildren(array, array_view->n_children); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + + for (int64_t i = 0; i < array_view->n_children; i++) { + result = + ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + } } - private_data->layout = array_view->layout; + if (array_view->dictionary != NULL) { + result = ArrowArrayAllocateDictionary(array); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } - for (int64_t i = 0; i < array_view->n_children; i++) { - int result = - ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + result = + ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); if (result != NANOARROW_OK) { - array->release(array); + ArrowArrayRelease(array); return result; } } @@ -1922,7 +1982,7 @@ static ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, } ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, - struct ArrowSchema* schema, + const struct ArrowSchema* schema, struct ArrowError* error) { struct ArrowArrayView array_view; NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error)); @@ -1957,9 +2017,7 @@ ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_ch return ENOMEM; } - for (int64_t i = 0; i < n_children; i++) { - array->children[i] = NULL; - } + memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); for (int64_t i = 0; i < n_children; i++) { array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); @@ -2027,6 +2085,16 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_v ArrowArrayViewInitFromType(array_view, private_data->storage_type); array_view->layout = private_data->layout; array_view->array = array; + array_view->length = array->length; + array_view->offset = array->offset; + array_view->null_count = array->null_count; + + array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; + array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; + array_view->buffer_views[1].data.as_uint8 = private_data->buffers[0].data; + array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; + array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; + array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); if (result != NANOARROW_OK) { @@ -2042,6 +2110,20 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_v } } + if (array->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + return NANOARROW_OK; } @@ -2103,7 +2185,7 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { case NANOARROW_TYPE_LARGE_BINARY: case NANOARROW_TYPE_LARGE_STRING: if (ArrowArrayBuffer(array, 2)->data == NULL) { - ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0)); } break; default: @@ -2114,6 +2196,10 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i])); } + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); + } + return NANOARROW_OK; } @@ -2121,46 +2207,17 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; - for (int64_t i = 0; i < 3; i++) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; } for (int64_t i = 0; i < array->n_children; i++) { ArrowArrayFlushInternalPointers(array->children[i]); } -} - -static ArrowErrorCode ArrowArrayCheckInternalBufferSizes( - struct ArrowArray* array, struct ArrowArrayView* array_view, char set_length, - struct ArrowError* error) { - if (set_length) { - ArrowArrayViewSetLength(array_view, array->offset + array->length); - } - - for (int64_t i = 0; i < array->n_buffers; i++) { - if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && - array->null_count == 0 && array->buffers[i] == NULL) { - continue; - } - int64_t expected_size = array_view->buffer_views[i].size_bytes; - int64_t actual_size = ArrowArrayBuffer(array, i)->size_bytes; - - if (actual_size < expected_size) { - ArrowErrorSet( - error, - "Expected buffer %d to size >= %ld bytes but found buffer with %ld bytes", - (int)i, (long)expected_size, (long)actual_size); - return EINVAL; - } - } - - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayCheckInternalBufferSizes( - array->children[i], array_view->children[i], set_length, error)); + if (array->dictionary != NULL) { + ArrowArrayFlushInternalPointers(array->dictionary); } - - return NANOARROW_OK; } ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, @@ -2170,7 +2227,7 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, // in some implementations (at least one version of Arrow C++ at the time this // was added). Only do this fix if we can assume CPU data access. if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { - NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array)); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); } // Make sure the value we get with array->buffers[i] is set to the actual @@ -2181,44 +2238,11 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, return NANOARROW_OK; } - // Check buffer sizes to make sure we are not sending an ArrowArray - // into the wild that is going to segfault + // For validation, initialize an ArrowArrayView with our known buffer sizes struct ArrowArrayView array_view; - - NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array)); - - // Check buffer sizes once without using internal buffer data since - // ArrowArrayViewSetArray() assumes that all the buffers are long enough - // and issues invalid reads on offset buffers if they are not - int result = ArrowArrayCheckInternalBufferSizes(array, &array_view, 1, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - - if (validation_level == NANOARROW_VALIDATION_LEVEL_MINIMAL) { - ArrowArrayViewReset(&array_view); - return NANOARROW_OK; - } - - result = ArrowArrayViewSetArray(&array_view, array, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - - result = ArrowArrayCheckInternalBufferSizes(array, &array_view, 0, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - - if (validation_level == NANOARROW_VALIDATION_LEVEL_DEFAULT) { - ArrowArrayViewReset(&array_view); - return NANOARROW_OK; - } - - result = ArrowArrayViewValidateFull(&array_view, error); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array), + error); + int result = ArrowArrayViewValidate(&array_view, validation_level, error); ArrowArrayViewReset(&array_view); return result; } @@ -2265,8 +2289,23 @@ ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, return NANOARROW_OK; } +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) { + if (array_view->dictionary != NULL) { + return EINVAL; + } + + array_view->dictionary = + (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); + if (array_view->dictionary == NULL) { + return ENOMEM; + } + + ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED); + return NANOARROW_OK; +} + ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, - struct ArrowSchema* schema, + const struct ArrowSchema* schema, struct ArrowError* error) { struct ArrowSchemaView schema_view; int result = ArrowSchemaViewInit(&schema_view, schema, error); @@ -2279,6 +2318,7 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, result = ArrowArrayViewAllocateChildren(array_view, schema->n_children); if (result != NANOARROW_OK) { + ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed"); ArrowArrayViewReset(array_view); return result; } @@ -2292,6 +2332,21 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, } } + if (schema->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = + ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION || array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) { array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t)); @@ -2300,8 +2355,8 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, } memset(array_view->union_type_id_map, -1, 256); - int8_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, - array_view->union_type_id_map + 128); + int32_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, + array_view->union_type_id_map + 128); for (int8_t child_index = 0; child_index < n_type_ids; child_index++) { int8_t type_id = array_view->union_type_id_map[128 + child_index]; array_view->union_type_id_map[type_id] = child_index; @@ -2323,6 +2378,11 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) { ArrowFree(array_view->children); } + if (array_view->dictionary != NULL) { + ArrowArrayViewReset(array_view->dictionary); + ArrowFree(array_view->dictionary); + } + if (array_view->union_type_id_map != NULL) { ArrowFree(array_view->union_type_id_map); } @@ -2331,9 +2391,8 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) { } void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; - array_view->buffer_views[i].data.data = NULL; switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_VALIDITY: @@ -2377,57 +2436,217 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) } } -ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, - struct ArrowArray* array, - struct ArrowError* error) { +// This version recursively extracts information from the array and stores it +// in the array view, performing any checks that require the original array. +static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { array_view->array = array; - - // Check length and offset - if (array->offset < 0) { - ArrowErrorSet(error, "Expected array offset >= 0 but found array offset of %ld", - (long)array->offset); - return EINVAL; - } - - if (array->length < 0) { - ArrowErrorSet(error, "Expected array length >= 0 but found array length of %ld", - (long)array->length); - return EINVAL; - } - - // First pass setting lengths that do not depend on the data buffer - ArrowArrayViewSetLength(array_view, array->offset + array->length); + array_view->offset = array->offset; + array_view->length = array->length; + array_view->null_count = array->null_count; int64_t buffers_required = 0; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } buffers_required++; - // If the null_count is 0, the validity buffer can be NULL - if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && - array->null_count == 0 && array->buffers[i] == NULL) { + // Set buffer pointer + array_view->buffer_views[i].data.data = array->buffers[i]; + + // If non-null, set buffer size to unknown. + if (array->buffers[i] == NULL) { array_view->buffer_views[i].size_bytes = 0; + } else { + array_view->buffer_views[i].size_bytes = -1; } - - array_view->buffer_views[i].data.data = array->buffers[i]; } + // Check the number of buffers if (buffers_required != array->n_buffers) { ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)", (int)buffers_required, (int)array->n_buffers); return EINVAL; } + // Check number of children if (array_view->n_children != array->n_children) { ArrowErrorSet(error, "Expected %ld children but found %ld children", (long)array_view->n_children, (long)array->n_children); return EINVAL; } - // Check child sizes and calculate sizes that depend on data in the array buffers + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i], + array->children[i], error)); + } + + // Check dictionary + if (array->dictionary == NULL && array_view->dictionary != NULL) { + ArrowErrorSet(error, "Expected dictionary but found NULL"); + return EINVAL; + } + + if (array->dictionary != NULL && array_view->dictionary == NULL) { + ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member"); + return EINVAL; + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, + struct ArrowError* error) { + if (array_view->length < 0) { + ArrowErrorSet(error, "Expected length >= 0 but found length %ld", + (long)array_view->length); + return EINVAL; + } + + if (array_view->offset < 0) { + ArrowErrorSet(error, "Expected offset >= 0 but found offset %ld", + (long)array_view->offset); + return EINVAL; + } + + // Calculate buffer sizes that do not require buffer access. If marked as + // unknown, assign the buffer size; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + + // Only loop over the first two buffers because the size of the third buffer + // is always data dependent for all current Arrow types. + for (int i = 0; i < 2; i++) { + int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; + // Initialize with a value that will cause an error if accidentally used uninitialized + int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1; + + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: + if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) { + continue; + } + + min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length); + break; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Probably don't want/need to rely on the producer to have allocated an + // offsets buffer of length 1 for a zero-size array + min_buffer_size_bytes = + (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1); + break; + case NANOARROW_BUFFER_TYPE_DATA: + min_buffer_size_bytes = + _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * + offset_plus_length) / + 8; + break; + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + min_buffer_size_bytes = element_size_bytes * offset_plus_length; + break; + case NANOARROW_BUFFER_TYPE_NONE: + continue; + } + + // Assign or validate buffer size + if (array_view->buffer_views[i].size_bytes == -1) { + array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; + } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { + ArrowErrorSet(error, + "Expected %s array buffer %d to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (int)i, + (long)min_buffer_size_bytes, + (long)array_view->buffer_views[i].size_bytes); + return EINVAL; + } + } + + // For list, fixed-size list and map views, we can validate the number of children + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_MAP: + if (array_view->n_children != 1) { + ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays", + ArrowTypeString(array_view->storage_type), + (long)array_view->n_children); + return EINVAL; + } + default: + break; + } + + // For struct, the sparse union, and the fixed-size list views, we can validate child + // lengths. + int64_t child_min_length; + switch (array_view->storage_type) { + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_STRUCT: + child_min_length = (array_view->offset + array_view->length); + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i]->length < child_min_length) { + ArrowErrorSet( + error, + "Expected struct child %d to have length >= %ld but found child with " + "length %ld", + (int)(i + 1), (long)(child_min_length), + (long)array_view->children[i]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_min_length = (array_view->offset + array_view->length) * + array_view->layout.child_size_elements; + if (array_view->children[0]->length < child_min_length) { + ArrowErrorSet(error, + "Expected child of fixed_size_list array to have length >= %ld but " + "found array with length %ld", + (long)child_min_length, (long)array_view->children[0]->length); + return EINVAL; + } + break; + default: + break; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewValidateMinimal(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, + struct ArrowError* error) { + // Perform minimal validation. This will validate or assign + // buffer sizes as long as buffer access is not required. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + // Calculate buffer sizes or child lengths that require accessing the offsets + // buffer. Where appropriate, validate that the first offset is >= 0. + // If a buffer size is marked as unknown, assign it; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + int64_t first_offset; int64_t last_offset; switch (array_view->storage_type) { @@ -2441,11 +2660,22 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int32[array->offset + array->length]; - array_view->buffer_views[2].size_bytes = last_offset; + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } } break; + case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { @@ -2456,34 +2686,38 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int64[array->offset + array->length]; - array_view->buffer_views[2].size_bytes = last_offset; + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } } break; + case NANOARROW_TYPE_STRUCT: for (int64_t i = 0; i < array_view->n_children; i++) { - if (array->children[i]->length < (array->offset + array->length)) { + if (array_view->children[i]->length < offset_plus_length) { ArrowErrorSet( error, "Expected struct child %d to have length >= %ld but found child with " "length %ld", - (int)(i + 1), (long)(array->offset + array->length), - (long)array->children[i]->length); + (int)(i + 1), (long)offset_plus_length, + (long)array_view->children[i]->length); return EINVAL; } } break; - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_MAP: { - const char* type_name = - array_view->storage_type == NANOARROW_TYPE_LIST ? "list" : "map"; - if (array->n_children != 1) { - ArrowErrorSet(error, "Expected 1 child of %s array but found %d child arrays", - type_name, (int)array->n_children); - return EINVAL; - } + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int32[0]; if (first_offset < 0) { @@ -2492,27 +2726,20 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int32[array->offset + array->length]; - if (array->children[0]->length < last_offset) { + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, - "Expected child of %s array with length >= %ld but found array with " + "Expected child of %s array to have length >= %ld but found array with " "length %ld", - type_name, (long)last_offset, (long)array->children[0]->length); + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->children[0]->length); return EINVAL; } } break; - } - case NANOARROW_TYPE_LARGE_LIST: - if (array->n_children != 1) { - ArrowErrorSet(error, - "Expected 1 child of large list array but found %d child arrays", - (int)array->n_children); - return EINVAL; - } + case NANOARROW_TYPE_LARGE_LIST: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int64[0]; if (first_offset < 0) { @@ -2521,49 +2748,61 @@ ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, return EINVAL; } - last_offset = - array_view->buffer_views[1].data.as_int64[array->offset + array->length]; - if (array->children[0]->length < last_offset) { + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, - "Expected child of large list array with length >= %ld but found array " + "Expected child of large list array to have length >= %ld but found array " "with length %ld", - (long)last_offset, (long)array->children[0]->length); + (long)last_offset, (long)array_view->children[0]->length); return EINVAL; } } break; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - if (array->n_children != 1) { - ArrowErrorSet(error, - "Expected 1 child of fixed-size array but found %d child arrays", - (int)array->n_children); - return EINVAL; - } - - last_offset = - (array->offset + array->length) * array_view->layout.child_size_elements; - if (array->children[0]->length < last_offset) { - ArrowErrorSet( - error, - "Expected child of fixed-size list array with length >= %ld but found array " - "with length %ld", - (long)last_offset, (long)array->children[0]->length); - return EINVAL; - } - break; default: break; } + // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK( - ArrowArrayViewSetArray(array_view->children[i], array->children[i], error)); + ArrowArrayViewValidateDefault(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error)); } return NANOARROW_OK; } +ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + return NANOARROW_OK; +} + static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, struct ArrowError* error) { if (view.size_bytes <= (int64_t)sizeof(int32_t)) { @@ -2571,10 +2810,8 @@ static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { - int32_t diff = view.data.as_int32[i] - view.data.as_int32[i - 1]; - if (diff < 0) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", - (long)i, (long)diff); + if (view.data.as_int32[i] < view.data.as_int32[i - 1]) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); return EINVAL; } } @@ -2589,10 +2826,8 @@ static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { - int64_t diff = view.data.as_int64[i] - view.data.as_int64[i - 1]; - if (diff < 0) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", - (long)i, (long)diff); + if (view.data.as_int64[i] < view.data.as_int64[i - 1]) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); return EINVAL; } } @@ -2635,9 +2870,9 @@ static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, return NANOARROW_OK; } -ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, - struct ArrowError* error) { - for (int i = 0; i < 3; i++) { +static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, + struct ArrowError* error) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_DATA_OFFSET: if (array_view->layout.element_size_bits[i] == 32) { @@ -2655,17 +2890,18 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { - // Check that we have valid type ids. if (array_view->union_type_id_map == NULL) { - // If the union_type_id map is NULL - // (e.g., when using ArrowArrayInitFromType() + ArrowArrayAllocateChildren() - // + ArrowArrayFinishBuilding()), we don't have enough information to validate - // this buffer (GH-178). + // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + + // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough + // information to validate this buffer. + ArrowErrorSet(error, + "Insufficient information provided for validation of union array"); + return EINVAL; } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices( array_view->union_type_id_map, array_view->n_children, array_view->n_children)) { - NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8(array_view->buffer_views[0], 0, - array_view->n_children - 1, error)); + NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( + array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error)); } else { NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0], array_view->union_type_id_map + 128, @@ -2676,27 +2912,53 @@ ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION && array_view->union_type_id_map != NULL) { // Check that offsets refer to child elements that actually exist - for (int64_t i = 0; i < array_view->array->length; i++) { + for (int64_t i = 0; i < array_view->length; i++) { int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i); int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); - int64_t child_length = array_view->array->children[child_id]->length; + int64_t child_length = array_view->children[child_id]->length; if (offset < 0 || offset > child_length) { ArrowErrorSet( error, "[%ld] Expected union offset for child id %d to be between 0 and %ld but " "found offset value %ld", - (long)i, (int)child_id, (long)child_length, offset); + (long)i, (int)child_id, (long)child_length, (long)offset); return EINVAL; } } } + // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); } + // Dictionary valiation not implemented + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, error)); + // TODO: validate the indices + } + return NANOARROW_OK; } + +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + switch (validation_level) { + case NANOARROW_VALIDATION_LEVEL_NONE: + return NANOARROW_OK; + case NANOARROW_VALIDATION_LEVEL_MINIMAL: + return ArrowArrayViewValidateMinimal(array_view, error); + case NANOARROW_VALIDATION_LEVEL_DEFAULT: + return ArrowArrayViewValidateDefault(array_view, error); + case NANOARROW_VALIDATION_LEVEL_FULL: + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + return ArrowArrayViewValidateFull(array_view, error); + } + + ArrowErrorSet(error, "validation_level not recognized"); + return EINVAL; +} // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -2756,6 +3018,7 @@ static int ArrowBasicArrayStreamGetNext(struct ArrowArrayStream* array_stream, static const char* ArrowBasicArrayStreamGetLastError( struct ArrowArrayStream* array_stream) { + NANOARROW_UNUSED(array_stream); return NULL; } @@ -2768,12 +3031,12 @@ static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) (struct BasicArrayStreamPrivate*)array_stream->private_data; if (private_data->schema.release != NULL) { - private_data->schema.release(&private_data->schema); + ArrowSchemaRelease(&private_data->schema); } for (int64_t i = 0; i < private_data->n_arrays; i++) { if (private_data->arrays[i].release != NULL) { - private_data->arrays[i].release(&private_data->arrays[i]); + ArrowArrayRelease(&private_data->arrays[i]); } } @@ -2787,8 +3050,9 @@ static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, struct ArrowSchema* schema, int64_t n_arrays) { - struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)ArrowMalloc( - sizeof(struct BasicArrayStreamPrivate)); + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)ArrowMalloc( + sizeof(struct BasicArrayStreamPrivate)); if (private_data == NULL) { return ENOMEM; } @@ -2827,7 +3091,7 @@ void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_ ArrowArrayMove(array, &private_data->arrays[i]); } -ArrowErrorCode ArrowBasicArrayStreamValidate(struct ArrowArrayStream* array_stream, +ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, struct ArrowError* error) { struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)array_stream->private_data; diff --git a/apis/r/src/nanoarrow.h b/apis/r/src/nanoarrow.h index 90ce2dc06a..331da29837 100644 --- a/apis/r/src/nanoarrow.h +++ b/apis/r/src/nanoarrow.h @@ -19,9 +19,9 @@ #define NANOARROW_BUILD_ID_H_INCLUDED #define NANOARROW_VERSION_MAJOR 0 -#define NANOARROW_VERSION_MINOR 2 +#define NANOARROW_VERSION_MINOR 4 #define NANOARROW_VERSION_PATCH 0 -#define NANOARROW_VERSION "0.2.0-SNAPSHOT" +#define NANOARROW_VERSION "0.4.0-SNAPSHOT" #define NANOARROW_VERSION_INT \ (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ @@ -55,6 +55,11 @@ +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#include +#include +#endif + #ifdef __cplusplus extern "C" { #endif @@ -157,25 +162,6 @@ struct ArrowArrayStream { #endif // ARROW_C_STREAM_INTERFACE #endif // ARROW_FLAG_DICTIONARY_ORDERED -/// \brief Move the contents of src into dst and set src->release to NULL -static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { - memcpy(dst, src, sizeof(struct ArrowSchema)); - src->release = NULL; -} - -/// \brief Move the contents of src into dst and set src->release to NULL -static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { - memcpy(dst, src, sizeof(struct ArrowArray)); - src->release = NULL; -} - -/// \brief Move the contents of src into dst and set src->release to NULL -static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, - struct ArrowArrayStream* dst) { - memcpy(dst, src, sizeof(struct ArrowArrayStream)); - src->release = NULL; -} - /// @} // Utility macros @@ -191,6 +177,58 @@ static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, #define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) +#define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ <= max_) ? NANOARROW_OK : EINVAL) + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d\n* %s:%d", EXPR_STR, \ + NAME, __FILE__, __LINE__); \ + return NAME; \ + } \ + } while (0) +#else +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ + return NAME; \ + } \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +// For checking ArrowErrorSet() calls for valid printf format strings/arguments +// If using mingw's c99-compliant printf, we need a different format-checking attribute +#if defined(__USE_MINGW_ANSI_STDIO) && defined(__MINGW_PRINTF_FORMAT) +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE \ + __attribute__((format(__MINGW_PRINTF_FORMAT, 2, 3))) +#elif defined(__GNUC__) +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE __attribute__((format(printf, 2, 3))) +#else +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE +#endif + +// For checking calls to functions that return ArrowErrorCode +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define NANOARROW_CHECK_RETURN_ATTRIBUTE __attribute__((warn_unused_result)) +#elif defined(_MSC_VER) && (_MSC_VER >= 1700) +#define NANOARROW_CHECK_RETURN_ATTRIBUTE _Check_return_ +#else +#define NANOARROW_CHECK_RETURN_ATTRIBUTE +#endif + +#else +#define NANOARROW_CHECK_RETURN_ATTRIBUTE +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE +#endif + +#define NANOARROW_UNUSED(x) (void)(x) + /// \brief Return code for success. /// \ingroup nanoarrow-errors #define NANOARROW_OK 0 @@ -199,11 +237,194 @@ static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, /// \ingroup nanoarrow-errors typedef int ArrowErrorCode; +#if defined(NANOARROW_DEBUG) +#define ArrowErrorCode NANOARROW_CHECK_RETURN_ATTRIBUTE ArrowErrorCode +#endif + +/// \brief Error type containing a UTF-8 encoded message. +/// \ingroup nanoarrow-errors +struct ArrowError { + /// \brief A character buffer with space for an error message. + char message[1024]; +}; + +/// \brief Ensure an ArrowError is null-terminated by zeroing the first character. +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, this function does nothing. +static inline void ArrowErrorInit(struct ArrowError* error) { + if (error != NULL) { + error->message[0] = '\0'; + } +} + +/// \brief Get the contents of an error +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, returns "", or returns the contents of the error message +/// otherwise. +static inline const char* ArrowErrorMessage(struct ArrowError* error) { + if (error == NULL) { + return ""; + } else { + return error->message; + } +} + +/// \brief Set the contents of an error from an existing null-terminated string +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, this function does nothing. +static inline void ArrowErrorSetString(struct ArrowError* error, const char* src) { + if (error == NULL) { + return; + } + + int64_t src_len = strlen(src); + if (src_len >= ((int64_t)sizeof(error->message))) { + memcpy(error->message, src, sizeof(error->message) - 1); + error->message[sizeof(error->message) - 1] = '\0'; + } else { + memcpy(error->message, src, src_len); + error->message[src_len] = '\0'; + } +} + /// \brief Check the result of an expression and return it if not NANOARROW_OK /// \ingroup nanoarrow-errors #define NANOARROW_RETURN_NOT_OK(EXPR) \ _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) +/// \brief Check the result of an expression and return it if not NANOARROW_OK, +/// adding an auto-generated message to an ArrowError. +/// \ingroup nanoarrow-errors +/// +/// This macro is used to ensure that functions that accept an ArrowError +/// as input always set its message when returning an error code (e.g., when calling +/// a nanoarrow function that does *not* accept ArrowError). +#define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ + _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, ERROR_EXPR, #EXPR) + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ + do { \ + fprintf(stderr, "%s failed with code %d\n* %s:%d\n", EXPR_STR, (int)(VALUE), \ + __FILE__, (int)__LINE__); \ + abort(); \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ + } while (0) + +/// \brief Assert that an expression's value is NANOARROW_OK +/// \ingroup nanoarrow-errors +/// +/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), +/// print a message to stderr and abort. If nanoarrow was built in release mode, +/// this statement has no effect. You can customize fatal error behaviour +/// be defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h +/// This macro is provided as a convenience for users and is not used internally. +#define NANOARROW_ASSERT_OK(EXPR) \ + _NANOARROW_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) + +#define _NANOARROW_DCHECK_IMPL(EXPR, EXPR_STR) \ + do { \ + if (!(EXPR)) NANOARROW_PRINT_AND_DIE(-1, EXPR_STR); \ + } while (0) + +#define NANOARROW_DCHECK(EXPR) _NANOARROW_DCHECK_IMPL(EXPR, #EXPR) +#else +#define NANOARROW_ASSERT_OK(EXPR) EXPR +#define NANOARROW_DCHECK(EXPR) +#endif + +static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowSchema)); + src->release = NULL; +} + +static inline void ArrowSchemaRelease(struct ArrowSchema* schema) { + NANOARROW_DCHECK(schema != NULL); + schema->release(schema); + NANOARROW_DCHECK(schema->release == NULL); +} + +static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowArray)); + src->release = NULL; +} + +static inline void ArrowArrayRelease(struct ArrowArray* array) { + NANOARROW_DCHECK(array != NULL); + array->release(array); + NANOARROW_DCHECK(array->release == NULL); +} + +static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowArrayStream)); + src->release = NULL; +} + +static inline const char* ArrowArrayStreamGetLastError( + struct ArrowArrayStream* array_stream) { + NANOARROW_DCHECK(array_stream != NULL); + + const char* value = array_stream->get_last_error(array_stream); + if (value == NULL) { + return ""; + } else { + return value; + } +} + +static inline ArrowErrorCode ArrowArrayStreamGetSchema( + struct ArrowArrayStream* array_stream, struct ArrowSchema* out, + struct ArrowError* error) { + NANOARROW_DCHECK(array_stream != NULL); + + int result = array_stream->get_schema(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } + + return result; +} + +static inline ArrowErrorCode ArrowArrayStreamGetNext( + struct ArrowArrayStream* array_stream, struct ArrowArray* out, + struct ArrowError* error) { + NANOARROW_DCHECK(array_stream != NULL); + + int result = array_stream->get_next(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } + + return result; +} + +static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream) { + NANOARROW_DCHECK(array_stream != NULL); + array_stream->release(array_stream); + NANOARROW_DCHECK(array_stream->release == NULL); +} + static char _ArrowIsLittleEndian(void) { uint32_t check = 1; char first_byte; @@ -263,6 +484,8 @@ enum ArrowType { /// \ingroup nanoarrow-utils /// /// Returns NULL for invalid values for type +static inline const char* ArrowTypeString(enum ArrowType type); + static inline const char* ArrowTypeString(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_NA: @@ -381,6 +604,8 @@ enum ArrowValidationLevel { /// \ingroup nanoarrow-utils /// /// Returns NULL for invalid values for time_unit +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); + static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: @@ -407,6 +632,14 @@ enum ArrowBufferType { NANOARROW_BUFFER_TYPE_DATA }; +/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout +/// \ingroup nanoarrow-array-view +/// +/// All currently supported types have 3 buffers or fewer; however, future types +/// may involve a variable number of buffers (e.g., string view). These buffers +/// will be represented by separate members of the ArrowArrayView or ArrowLayout. +#define NANOARROW_MAX_FIXED_BUFFERS 3 + /// \brief An non-owning view of a string /// \ingroup nanoarrow-utils struct ArrowStringView { @@ -423,6 +656,8 @@ struct ArrowStringView { /// \brief Return a view of a const C string /// \ingroup nanoarrow-utils +static inline struct ArrowStringView ArrowCharView(const char* value); + static inline struct ArrowStringView ArrowCharView(const char* value) { struct ArrowStringView out; @@ -436,26 +671,28 @@ static inline struct ArrowStringView ArrowCharView(const char* value) { return out; } +union ArrowBufferViewData { + const void* data; + const int8_t* as_int8; + const uint8_t* as_uint8; + const int16_t* as_int16; + const uint16_t* as_uint16; + const int32_t* as_int32; + const uint32_t* as_uint32; + const int64_t* as_int64; + const uint64_t* as_uint64; + const double* as_double; + const float* as_float; + const char* as_char; +}; + /// \brief An non-owning view of a buffer /// \ingroup nanoarrow-utils struct ArrowBufferView { /// \brief A pointer to the start of the buffer /// /// If size_bytes is 0, this value may be NULL. - union { - const void* data; - const int8_t* as_int8; - const uint8_t* as_uint8; - const int16_t* as_int16; - const uint16_t* as_uint16; - const int32_t* as_int32; - const uint32_t* as_uint32; - const int64_t* as_int64; - const uint64_t* as_uint64; - const double* as_double; - const float* as_float; - const char* as_char; - } data; + union ArrowBufferViewData data; /// \brief The size of the buffer in bytes int64_t size_bytes; @@ -515,10 +752,13 @@ struct ArrowBitmap { /// the length and offset of the array. struct ArrowLayout { /// \brief The function of each buffer - enum ArrowBufferType buffer_type[3]; + enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The data type of each buffer + enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The size of an element each buffer or 0 if this size is variable or unknown - int64_t element_size_bits[3]; + int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The number of elements in the child array per element in this array for a /// fixed-size list @@ -531,11 +771,22 @@ struct ArrowLayout { /// This data structure provides access to the values contained within /// an ArrowArray with fields provided in a more readily-extractible /// form. You can re-use an ArrowArrayView for multiple ArrowArrays -/// with the same storage type, or use it to represent a hypothetical -/// ArrowArray that does not exist yet. +/// with the same storage type, use it to represent a hypothetical +/// ArrowArray that does not exist yet, or use it to validate the buffers +/// of a future ArrowArray. struct ArrowArrayView { - /// \brief The underlying ArrowArray or NULL if it has not been set - struct ArrowArray* array; + /// \brief The underlying ArrowArray or NULL if it has not been set or + /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. + const struct ArrowArray* array; + + /// \brief The number of elements from the physical start of the buffers. + int64_t offset; + + /// \brief The number of elements in this view. + int64_t length; + + /// \brief A cached null count or -1 to indicate that this value is unknown. + int64_t null_count; /// \brief The type used to store values in this array /// @@ -549,7 +800,7 @@ struct ArrowArrayView { struct ArrowLayout layout; /// \brief This Array's buffers as ArrowBufferView objects - struct ArrowBufferView buffer_views[3]; + struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The number of children of this view int64_t n_children; @@ -557,6 +808,9 @@ struct ArrowArrayView { /// \brief Pointers to views of this array's children struct ArrowArrayView** children; + /// \brief Pointer to a view of this array's dictionary + struct ArrowArrayView* dictionary; + /// \brief Union type id to child index mapping /// /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer @@ -574,12 +828,12 @@ struct ArrowArrayPrivateData { struct ArrowBitmap bitmap; // Holder for additional buffers as required - struct ArrowBuffer buffers[2]; + struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1]; // The array of pointers to buffers. This must be updated after a sequence // of appends to synchronize its values with the actual buffer addresses // (which may have ben reallocated uring that time) - const void* buffer_data[3]; + const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown enum ArrowType storage_type; @@ -593,6 +847,29 @@ struct ArrowArrayPrivateData { int8_t union_type_id_is_child_index; }; +/// \brief A representation of an interval. +/// \ingroup nanoarrow-utils +struct ArrowInterval { + /// \brief The type of interval being used + enum ArrowType type; + /// \brief The number of months represented by the interval + int32_t months; + /// \brief The number of days represented by the interval + int32_t days; + /// \brief The number of ms represented by the interval + int32_t ms; + /// \brief The number of ns represented by the interval + int64_t ns; +}; + +/// \brief Zero initialize an Interval with a given unit +/// \ingroup nanoarrow-utils +static inline void ArrowIntervalInit(struct ArrowInterval* interval, + enum ArrowType type) { + memset(interval, 0, sizeof(struct ArrowInterval)); + interval->type = type; +} + /// \brief A representation of a fixed-precision decimal number /// \ingroup nanoarrow-utils /// @@ -642,19 +919,20 @@ static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwid /// This does not check if the decimal's precision sufficiently small to fit /// within the signed 64-bit integer range (A precision less than or equal /// to 18 is sufficiently small). -static inline int64_t ArrowDecimalGetIntUnsafe(struct ArrowDecimal* decimal) { +static inline int64_t ArrowDecimalGetIntUnsafe(const struct ArrowDecimal* decimal) { return (int64_t)decimal->words[decimal->low_word_index]; } /// \brief Copy the bytes of this decimal into a sufficiently large buffer /// \ingroup nanoarrow-utils -static inline void ArrowDecimalGetBytes(struct ArrowDecimal* decimal, uint8_t* out) { +static inline void ArrowDecimalGetBytes(const struct ArrowDecimal* decimal, + uint8_t* out) { memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); } /// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise /// \ingroup nanoarrow-utils -static inline int64_t ArrowDecimalSign(struct ArrowDecimal* decimal) { +static inline int64_t ArrowDecimalSign(const struct ArrowDecimal* decimal) { return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); } @@ -722,7 +1000,6 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, #define ArrowNanoarrowVersion NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) #define ArrowNanoarrowVersionInt \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) -#define ArrowErrorMessage NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorMessage) #define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc) #define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc) #define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree) @@ -776,6 +1053,10 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) #define ArrowArrayInitFromSchema \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) #define ArrowArrayAllocateChildren \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) #define ArrowArrayAllocateDictionary \ @@ -794,12 +1075,16 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) #define ArrowArrayViewAllocateChildren \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) +#define ArrowArrayViewAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary) #define ArrowArrayViewSetLength \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength) #define ArrowArrayViewSetArray \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) -#define ArrowArrayViewValidateFull \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidateFull) +#define ArrowArrayViewSetArrayMinimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) +#define ArrowArrayViewValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) #define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) #define ArrowBasicArrayStreamInit \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) @@ -860,27 +1145,84 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( /// @} +/// \brief Move the contents of an src ArrowSchema into dst and set src->release to NULL +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst); + +/// \brief Call the release callback of an ArrowSchema +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowSchemaRelease(struct ArrowSchema* schema); + +/// \brief Move the contents of an src ArrowArray into dst and set src->release to NULL +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst); + +/// \brief Call the release callback of an ArrowArray +static inline void ArrowArrayRelease(struct ArrowArray* array); + +/// \brief Move the contents of an src ArrowArrayStream into dst and set src->release to +/// NULL \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dst); + +/// \brief Call the get_schema callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_schema callback, this wrapper checks the return code +/// and propagates the error reported by get_last_error into error. This +/// makes it significantly less verbose to iterate over array streams +/// using NANOARROW_RETURN_NOT_OK()-style error handling. +static inline ArrowErrorCode ArrowArrayStreamGetSchema( + struct ArrowArrayStream* array_stream, struct ArrowSchema* out, + struct ArrowError* error); + +/// \brief Call the get_schema callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_next callback, this wrapper checks the return code +/// and propagates the error reported by get_last_error into error. This +/// makes it significantly less verbose to iterate over array streams +/// using NANOARROW_RETURN_NOT_OK()-style error handling. +static inline ArrowErrorCode ArrowArrayStreamGetNext( + struct ArrowArrayStream* array_stream, struct ArrowArray* out, + struct ArrowError* error); + +/// \brief Call the get_next callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_next callback, this function never returns NULL (i.e., its +/// result is safe to use in printf-style error formatters). Null values from the +/// original callback are reported as "". +static inline const char* ArrowArrayStreamGetLastError( + struct ArrowArrayStream* array_stream); + +/// \brief Call the release callback of an ArrowArrayStream +static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream); + /// \defgroup nanoarrow-errors Error handling /// /// Functions generally return an errno-compatible error code; functions that /// need to communicate more verbose error information accept a pointer /// to an ArrowError. This can be stack or statically allocated. The /// content of the message is undefined unless an error code has been -/// returned. +/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, the +/// ArrowError pointed to by the argument will be propagated with a +/// null-terminated error message. It is safe to pass a NULL ArrowError anywhere +/// in the nanoarrow API. +/// +/// Except where documented, it is generally not safe to continue after a +/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK and +/// NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ clients can use +/// the helpers provided in the nanoarrow.hpp header to facilitate using C++ idioms +/// for memory management and error propgagtion. /// /// @{ -/// \brief Error type containing a UTF-8 encoded message. -struct ArrowError { - /// \brief A character buffer with space for an error message. - char message[1024]; -}; - -/// \brief Set the contents of an error using printf syntax -ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...); - -/// \brief Get the contents of an error -const char* ArrowErrorMessage(struct ArrowError* error); +/// \brief Set the contents of an error using printf syntax. +/// +/// If error is NULL, this function does nothing and returns NANOARROW_OK. +NANOARROW_CHECK_PRINTF_ATTRIBUTE int ArrowErrorSet(struct ArrowError* error, + const char* fmt, ...); /// @} @@ -929,7 +1271,7 @@ ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowTyp /// and returns the number of characters required for the output if /// n were sufficiently large. If recursive is non-zero, the result will /// also include children. -int64_t ArrowSchemaToString(struct ArrowSchema* schema, char* out, int64_t n, +int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, char recursive); /// \brief Set the format field of a schema from an ArrowType @@ -991,7 +1333,7 @@ ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowTyp /// \brief Make a (recursive) copy of a schema /// /// Allocates and copies fields of schema into schema_out. -ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, +ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, struct ArrowSchema* schema_out); /// \brief Copy format into schema->format @@ -1103,10 +1445,10 @@ ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, /// Contains more readily extractable values than a raw ArrowSchema. /// Clients can stack or statically allocate this structure but are /// encouraged to use the provided getters to ensure forward -/// compatiblity. +/// compatibility. struct ArrowSchemaView { /// \brief A pointer to the schema represented by this view - struct ArrowSchema* schema; + const struct ArrowSchema* schema; /// \brief The data type represented by the schema /// @@ -1189,7 +1531,8 @@ struct ArrowSchemaView { /// \brief Initialize an ArrowSchemaView ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, - struct ArrowSchema* schema, struct ArrowError* error); + const struct ArrowSchema* schema, + struct ArrowError* error); /// @} @@ -1333,6 +1676,14 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l /// \brief Count true values in a bitmap static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); +/// \brief Extract int8 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out); + +/// \brief Extract int32 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, + int64_t length, int32_t* out); + /// \brief Initialize an ArrowBitmap /// /// Initialize the builder's buffer, empty its cache, and reset the size to zero @@ -1410,9 +1761,17 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, /// Caller is responsible for calling the array->release callback if /// NANOARROW_OK is returned. ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, - struct ArrowSchema* schema, + const struct ArrowSchema* schema, struct ArrowError* error); +/// \brief Initialize the contents of an ArrowArray from an ArrowArrayView +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + const struct ArrowArrayView* array_view, + struct ArrowError* error); + /// \brief Allocate the array->children array /// /// Includes the memory for each child struct ArrowArray, @@ -1500,32 +1859,43 @@ static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, /// \brief Append a string of bytes to an array /// /// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., -/// the underlying array is not a binary, string, large binary, large string, -/// or fixed-size binary array, or value is the wrong size for a fixed-size -/// binary array). +/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a +/// binary, string, large binary, large string, or fixed-size binary array, or value is +/// the wrong size for a fixed-size binary array). static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, struct ArrowBufferView value); /// \brief Append a string value to an array /// /// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., -/// the underlying array is not a string or large string array). +/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a +/// string or large string array). static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, struct ArrowStringView value); +/// \brief Append a Interval to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + const struct ArrowInterval* value); + /// \brief Append a decimal value to an array /// /// Returns NANOARROW_OK if array is a decimal array with the appropriate /// bitwidth or EINVAL otherwise. static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, - struct ArrowDecimal* value); + const struct ArrowDecimal* value); /// \brief Finish a nested array element /// /// Appends a non-null element to the array based on the first child's current -/// length. Returns NANOARROW_OK if the item was successfully added or EINVAL +/// length. Returns NANOARROW_OK if the item was successfully added, EOVERFLOW +/// if the child of a list or map array would exceed INT_MAX elements, or EINVAL /// if the underlying storage type is not a struct, list, large list, or fixed-size /// list, or if there was an attempt to add a struct or fixed-size list element where the /// length of the child array(s) did not match the expected length. @@ -1561,7 +1931,7 @@ ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, /// (i.e. NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU /// buffer data access is not possible or more validation (i.e., /// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or -/// corruptable source. +/// corruptible source. ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, enum ArrowValidationLevel validation_level, struct ArrowError* error); @@ -1570,7 +1940,7 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, /// \defgroup nanoarrow-array-view Reading arrays /// -/// These functions read and validate the contents ArrowArray structures +/// These functions read and validate the contents ArrowArray structures. /// /// @{ @@ -1587,83 +1957,102 @@ static inline void ArrowArrayViewMove(struct ArrowArrayView* src, /// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, - struct ArrowSchema* schema, + const struct ArrowSchema* schema, struct ArrowError* error); -/// \brief Allocate the schema_view->children array +/// \brief Allocate the array_view->children array /// /// Includes the memory for each child struct ArrowArrayView ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, int64_t n_children); +/// \brief Allocate array_view->dictionary +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view); + /// \brief Set data-independent buffer sizes from length void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); /// \brief Set buffer sizes and data pointers from an ArrowArray ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, - struct ArrowArray* array, struct ArrowError* error); + const struct ArrowArray* array, + struct ArrowError* error); -/// \brief Performs extra checks on the array that was set via ArrowArrayViewSetArray() -ArrowErrorCode ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, - struct ArrowError* error); +/// \brief Set buffer sizes and data pointers from an ArrowArray except for those +/// that require dereferencing buffer content. +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Performs checks on the content of an ArrowArrayView +/// +/// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray, +/// the buffer sizes and some content (fist and last offset) have already +/// been validated at the "default" level. If setting the buffer pointers +/// and sizes otherwise, you may wish to perform checks at a different level. See +/// documentation for ArrowValidationLevel for the details of checks performed +/// at each level. +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); /// \brief Reset the contents of an ArrowArrayView and frees resources void ArrowArrayViewReset(struct ArrowArrayView* array_view); /// \brief Check for a null element in an ArrowArrayView -static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i); +static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, + int64_t i); /// \brief Get the type id of a union array element -static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view, +static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, int64_t i); /// \brief Get the child index of a union array element -static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view, - int64_t i); +static inline int8_t ArrowArrayViewUnionChildIndex( + const struct ArrowArrayView* array_view, int64_t i); /// \brief Get the index to use into the relevant union child array -static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view, - int64_t i); +static inline int64_t ArrowArrayViewUnionChildOffset( + const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an integer /// /// This function does not check for null values, that values are actually integers, or /// that values are within a valid range for an int64. -static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view, +static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an unsigned integer /// /// This function does not check for null values, that values are actually integers, or /// that values are within a valid range for a uint64. -static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_view, - int64_t i); +static inline uint64_t ArrowArrayViewGetUIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as a double /// /// This function does not check for null values, or /// that values are within a valid range for a double. -static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_view, - int64_t i); +static inline double ArrowArrayViewGetDoubleUnsafe( + const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an ArrowStringView /// /// This function does not check for null values. static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( - struct ArrowArrayView* array_view, int64_t i); + const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an ArrowBufferView /// /// This function does not check for null values. static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( - struct ArrowArrayView* array_view, int64_t i); + const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an ArrowDecimal /// /// This function does not check for null values. The out parameter must /// be initialized with ArrowDecimalInit() with the proper parameters for this /// type before calling this for the first time. -static inline void ArrowArrayViewGetDecimalUnsafe(struct ArrowArrayView* array_view, +static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, int64_t i, struct ArrowDecimal* out); /// @} @@ -1700,11 +2089,17 @@ void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_ /// array_stream must have been initialized with ArrowBasicArrayStreamInit(). /// This function uses ArrowArrayStreamInitFromSchema() and ArrowArrayStreamSetArray() /// to validate the contents of the arrays. -ArrowErrorCode ArrowBasicArrayStreamValidate(struct ArrowArrayStream* array_stream, +ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, struct ArrowError* error); /// @} +// Undefine ArrowErrorCode, which may have been defined to annotate functions that return +// it to warn for an unused result. +#if defined(ArrowErrorCode) +#undef ArrowErrorCode +#endif + // Inline function definitions @@ -1938,20 +2333,124 @@ static inline int64_t _ArrowBytesForBits(int64_t bits) { return (bits >> 3) + ((bits & 7) != 0); } +static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; +} + +static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; +} + static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { - *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | - values[5] << 5 | values[6] << 6 | values[7] << 7); + *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); } static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { - *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | - values[5] << 5 | values[6] << 6 | values[7] << 7); + *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); } static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { return (bits[i >> 3] >> (i & 0x07)) & 1; } +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt8(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + +static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, + int64_t length, int32_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt32(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + static inline void ArrowBitSet(uint8_t* bits, int64_t i) { bits[i / 8] |= _ArrowkBitmask[i % 8]; } @@ -2012,36 +2511,37 @@ static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset const int64_t i_begin = start_offset; const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_end = i_end / 8 + 1; + const int64_t bytes_last_valid = i_last_valid / 8; - if (bytes_end == bytes_begin + 1) { + if (bytes_begin == bytes_last_valid) { // count bits within a single byte const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; const uint8_t only_byte_mask = - i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); + i_end % 8 == 0 ? last_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; return _ArrowkBytePopcount[byte_masked]; } const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; - const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; + const uint8_t last_byte_mask = i_end % 8 == 0 ? 0 : _ArrowkTrailingBitmask[i_end % 8]; int64_t count = 0; // first byte count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; // middle bytes - for (int64_t i = bytes_begin + 1; i < (bytes_end - 1); i++) { + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { count += _ArrowkBytePopcount[bits[i]]; } // last byte - count += _ArrowkBytePopcount[bits[bytes_end - 1] & ~last_byte_mask]; + count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; return count; } @@ -2166,7 +2666,7 @@ static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, if ((out_i_cursor % 8) != 0) { int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; for (int i = 0; i < n_partial_bits; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values[i]); } out_cursor++; @@ -2189,7 +2689,7 @@ static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, // Zero out the last byte *out_cursor = 0x00; for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values_cursor[i]); } out_cursor++; } @@ -2263,15 +2763,17 @@ static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int // is made. static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array, int8_t type_id) { + NANOARROW_UNUSED(array); return type_id; } static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array, int8_t child_index) { + NANOARROW_UNUSED(array); return child_index; } -static inline int8_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { +static inline int32_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { if (*type_ids == '\0') { return 0; } @@ -2286,7 +2788,7 @@ static inline int8_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) } if (out != NULL) { - out[i] = type_id; + out[i] = (int8_t)type_id; } i++; @@ -2323,7 +2825,7 @@ static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const int8_t* static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str, int64_t n_children) { int8_t type_ids[128]; - int8_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); + int32_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, n_children); } @@ -2350,7 +2852,7 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) } // Initialize any data offset buffer with a single zero - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && private_data->layout.element_size_bits[i] == 64) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); @@ -2360,16 +2862,20 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) } } - // Start building any child arrays + // Start building any child arrays or dictionaries for (int64_t i = 0; i < array->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); } + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); + } + return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { - for (int64_t i = 0; i < 3; i++) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); } @@ -2378,6 +2884,10 @@ static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); } + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); + } + return NANOARROW_OK; } @@ -2480,7 +2990,7 @@ static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* a struct ArrowBuffer* buffer; int64_t size_bytes; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { buffer = ArrowArrayBuffer(array, i); size_bytes = private_data->layout.element_size_bits[i] / 8; @@ -2559,10 +3069,10 @@ static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); return ArrowArrayAppendUInt(array, value); case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, value)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); break; case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, value)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; case NANOARROW_TYPE_BOOL: NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); @@ -2591,28 +3101,28 @@ static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); break; case NANOARROW_TYPE_UINT32: - _NANOARROW_CHECK_RANGE(value, 0, UINT32_MAX); + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); break; case NANOARROW_TYPE_UINT16: - _NANOARROW_CHECK_RANGE(value, 0, UINT16_MAX); + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); break; case NANOARROW_TYPE_UINT8: - _NANOARROW_CHECK_RANGE(value, 0, UINT8_MAX); + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); break; case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_INT8: - _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); + _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); return ArrowArrayAppendInt(array, value); case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, value)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); break; case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, value)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; case NANOARROW_TYPE_BOOL: NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); @@ -2671,11 +3181,11 @@ static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: offset = ((int32_t*)offset_buffer->data)[array->length]; - if ((offset + value.size_bytes) > INT32_MAX) { - return EINVAL; + if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { + return EOVERFLOW; } - offset += value.size_bytes; + offset += (int32_t)value.size_bytes; NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); NANOARROW_RETURN_NOT_OK( ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); @@ -2723,14 +3233,59 @@ static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, switch (private_data->storage_type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: return ArrowArrayAppendBytes(array, buffer_view); default: return EINVAL; } } +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + const struct ArrowInterval* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->ms)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(data_buffer, value->ns)); + break; + } + default: + return EINVAL; + } + + array->length++; + return NANOARROW_OK; +} + static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, - struct ArrowDecimal* value) { + const struct ArrowDecimal* value) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); @@ -2775,7 +3330,7 @@ static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { case NANOARROW_TYPE_MAP: child_length = array->children[0]->length; if (child_length > INT32_MAX) { - return EINVAL; + return EOVERFLOW; } NANOARROW_RETURN_NOT_OK( ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), (int32_t)child_length)); @@ -2824,7 +3379,7 @@ static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* arr switch (private_data->storage_type) { case NANOARROW_TYPE_DENSE_UNION: - // Apppend the target child length to the union offsets buffer + // Append the target child length to the union offsets buffer _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); @@ -2862,9 +3417,10 @@ static inline void ArrowArrayViewMove(struct ArrowArrayView* src, ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); } -static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i) { +static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, + int64_t i) { const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; - i += array_view->array->offset; + i += array_view->offset; switch (array_view->storage_type) { case NANOARROW_TYPE_NA: return 0x01; @@ -2877,7 +3433,7 @@ static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int } } -static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view, +static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, int64_t i) { switch (array_view->storage_type) { case NANOARROW_TYPE_DENSE_UNION: @@ -2888,8 +3444,8 @@ static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view } } -static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view, - int64_t i) { +static inline int8_t ArrowArrayViewUnionChildIndex( + const struct ArrowArrayView* array_view, int64_t i) { int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); if (array_view->union_type_id_map == NULL) { return type_id; @@ -2898,8 +3454,8 @@ static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_ } } -static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view, - int64_t i) { +static inline int64_t ArrowArrayViewUnionChildOffset( + const struct ArrowArrayView* array_view, int64_t i) { switch (array_view->storage_type) { case NANOARROW_TYPE_DENSE_UNION: return array_view->buffer_views[1].data.as_int32[i]; @@ -2910,15 +3466,28 @@ static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* arra } } -static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view, +static inline int64_t ArrowArrayViewListChildOffset( + const struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_LARGE_LIST: + return array_view->buffer_views[1].data.as_int64[i]; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, int64_t i) { - struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - i += array_view->array->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + i += array_view->offset; switch (array_view->storage_type) { case NANOARROW_TYPE_INT64: return data_view->data.as_int64[i]; case NANOARROW_TYPE_UINT64: return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: case NANOARROW_TYPE_INT32: return data_view->data.as_int32[i]; case NANOARROW_TYPE_UINT32: @@ -2932,9 +3501,9 @@ static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_vi case NANOARROW_TYPE_UINT8: return data_view->data.as_uint8[i]; case NANOARROW_TYPE_DOUBLE: - return data_view->data.as_double[i]; + return (int64_t)data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: - return data_view->data.as_float[i]; + return (int64_t)data_view->data.as_float[i]; case NANOARROW_TYPE_BOOL: return ArrowBitGet(data_view->data.as_uint8, i); default: @@ -2942,15 +3511,16 @@ static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_vi } } -static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_view, - int64_t i) { - i += array_view->array->offset; - struct ArrowBufferView* data_view = &array_view->buffer_views[1]; +static inline uint64_t ArrowArrayViewGetUIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; switch (array_view->storage_type) { case NANOARROW_TYPE_INT64: return data_view->data.as_int64[i]; case NANOARROW_TYPE_UINT64: return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: case NANOARROW_TYPE_INT32: return data_view->data.as_int32[i]; case NANOARROW_TYPE_UINT32: @@ -2964,9 +3534,9 @@ static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_ case NANOARROW_TYPE_UINT8: return data_view->data.as_uint8[i]; case NANOARROW_TYPE_DOUBLE: - return data_view->data.as_double[i]; + return (uint64_t)data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: - return data_view->data.as_float[i]; + return (uint64_t)data_view->data.as_float[i]; case NANOARROW_TYPE_BOOL: return ArrowBitGet(data_view->data.as_uint8, i); default: @@ -2974,15 +3544,15 @@ static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_ } } -static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_view, - int64_t i) { - i += array_view->array->offset; - struct ArrowBufferView* data_view = &array_view->buffer_views[1]; +static inline double ArrowArrayViewGetDoubleUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; switch (array_view->storage_type) { case NANOARROW_TYPE_INT64: - return data_view->data.as_int64[i]; + return (double)data_view->data.as_int64[i]; case NANOARROW_TYPE_UINT64: - return data_view->data.as_uint64[i]; + return (double)data_view->data.as_uint64[i]; case NANOARROW_TYPE_INT32: return data_view->data.as_int32[i]; case NANOARROW_TYPE_UINT32: @@ -3007,9 +3577,9 @@ static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_ } static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( - struct ArrowArrayView* array_view, int64_t i) { - i += array_view->array->offset; - struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; const char* data_view = array_view->buffer_views[2].data.as_char; struct ArrowStringView view; @@ -3040,9 +3610,9 @@ static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( } static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( - struct ArrowArrayView* array_view, int64_t i) { - i += array_view->array->offset; - struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; struct ArrowBufferView view; @@ -3073,9 +3643,36 @@ static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( return view; } -static inline void ArrowArrayViewGetDecimalUnsafe(struct ArrowArrayView* array_view, +static inline void ArrowArrayViewGetIntervalUnsafe( + const struct ArrowArrayView* array_view, int64_t i, struct ArrowInterval* out) { + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + const size_t size = sizeof(int32_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + const size_t size = sizeof(int32_t) + sizeof(int32_t); + memcpy(&out->days, data_view + i * size, sizeof(int32_t)); + memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + const size_t size = sizeof(int32_t) + sizeof(int32_t) + sizeof(int64_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); + memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); + break; + } + default: + break; + } +} + +static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, int64_t i, struct ArrowDecimal* out) { - i += array_view->array->offset; + i += array_view->offset; const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; switch (array_view->storage_type) { case NANOARROW_TYPE_DECIMAL128: diff --git a/apis/r/src/rutilities.cpp b/apis/r/src/rutilities.cpp index 2430dd8d81..61fe84f1e3 100644 --- a/apis/r/src/rutilities.cpp +++ b/apis/r/src/rutilities.cpp @@ -205,7 +205,7 @@ Rcpp::XPtr schema_setup_struct(Rcpp::XPtr schxp, int64 } extern "C" { - void ArrowArrayRelease(struct ArrowArray *array); // made non-static in nanoarrow.c + void ArrowArrayReleaseInternal(struct ArrowArray *array); // non-static in nanoarrow.c ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, // ditto enum ArrowType storage_type); } @@ -222,7 +222,7 @@ Rcpp::XPtr array_setup_struct(Rcpp::XPtr arrxp, int64_t array->buffers = NULL; array->children = NULL; array->dictionary = NULL; - array->release = &ArrowArrayRelease; + array->release = &ArrowArrayReleaseInternal; array->private_data = NULL; auto private_data = (struct ArrowArrayPrivateData*) ArrowMalloc(sizeof(struct ArrowArrayPrivateData)); From 9a5e0486f1ada671bb7153c51ee87b8cdf23f5e2 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 1 Mar 2024 09:06:17 -0600 Subject: [PATCH 02/39] Low-level wiring of nanoarrow at sr_* level --- apis/r/DESCRIPTION | 6 ++- apis/r/NAMESPACE | 1 + apis/r/R/RcppExports.R | 6 +++ apis/r/src/RcppExports.cpp | 13 +++++- apis/r/src/riterator.cpp | 87 +++++++++++++++++++++++++------------- apis/r/src/rutilities.h | 14 ++++++ 6 files changed, 94 insertions(+), 33 deletions(-) diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index b98fc73395..82ea7bf358 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -45,11 +45,13 @@ Imports: spdl, rlang, tools, - tibble + tibble, + nanoarrow LinkingTo: Rcpp, RcppSpdlog, - RcppInt64 + RcppInt64, + nanoarrow Additional_repositories: https://ghrr.github.io/drat Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 diff --git a/apis/r/NAMESPACE b/apis/r/NAMESPACE index 2607cb7deb..7faba888a5 100644 --- a/apis/r/NAMESPACE +++ b/apis/r/NAMESPACE @@ -76,6 +76,7 @@ export(tiledbsoma_stats_show) export(write_soma) import(R6) import(methods) +import(nanoarrow) import(utils) importFrom(Matrix,as.matrix) importFrom(Matrix,sparseMatrix) diff --git a/apis/r/R/RcppExports.R b/apis/r/R/RcppExports.R index 6612d43103..677969b4f6 100644 --- a/apis/r/R/RcppExports.R +++ b/apis/r/R/RcppExports.R @@ -92,6 +92,12 @@ sr_complete <- function(sr) { .Call(`_tiledbsoma_sr_complete`, sr) } +#' @noRd +#' @import nanoarrow +create_empty_arrow_table <- function() { + .Call(`_tiledbsoma_create_empty_arrow_table`) +} + sr_next <- function(sr) { .Call(`_tiledbsoma_sr_next`, sr) } diff --git a/apis/r/src/RcppExports.cpp b/apis/r/src/RcppExports.cpp index 25652298d9..15dffdea4f 100644 --- a/apis/r/src/RcppExports.cpp +++ b/apis/r/src/RcppExports.cpp @@ -129,8 +129,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// create_empty_arrow_table +nanoarrowXPtr create_empty_arrow_table(); +RcppExport SEXP _tiledbsoma_create_empty_arrow_table() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(create_empty_arrow_table()); + return rcpp_result_gen; +END_RCPP +} // sr_next -Rcpp::List sr_next(Rcpp::XPtr sr); +nanoarrowXPtr sr_next(Rcpp::XPtr sr); RcppExport SEXP _tiledbsoma_sr_next(SEXP srSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; @@ -220,6 +230,7 @@ static const R_CallMethodDef CallEntries[] = { {"_tiledbsoma_shape", (DL_FUNC) &_tiledbsoma_shape, 2}, {"_tiledbsoma_sr_setup", (DL_FUNC) &_tiledbsoma_sr_setup, 10}, {"_tiledbsoma_sr_complete", (DL_FUNC) &_tiledbsoma_sr_complete, 1}, + {"_tiledbsoma_create_empty_arrow_table", (DL_FUNC) &_tiledbsoma_create_empty_arrow_table, 0}, {"_tiledbsoma_sr_next", (DL_FUNC) &_tiledbsoma_sr_next, 1}, {"_tiledbsoma_tiledbsoma_stats_enable", (DL_FUNC) &_tiledbsoma_tiledbsoma_stats_enable, 0}, {"_tiledbsoma_tiledbsoma_stats_disable", (DL_FUNC) &_tiledbsoma_tiledbsoma_stats_disable, 0}, diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index 1cd170aacc..f479a986b2 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -4,14 +4,15 @@ #endif #include // for R interface to C++ -#include // for C interface to Arrow +#include // for C interface to Arrow +#include "nanoarrow.h" #include #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 4 #include #endif -// We get these via nanoarrow and must cannot include carrow.h again +// We get these via nanoarrow and must not include carrow.h again #define ARROW_SCHEMA_AND_ARRAY_DEFINED 1 #include @@ -164,20 +165,35 @@ bool sr_complete(Rcpp::XPtr sr) { return res; } -Rcpp::List create_empty_arrow_table() { - Rcpp::XPtr schemaxp = schema_owning_xptr(); - Rcpp::XPtr arrayxp = array_owning_xptr(); - schemaxp = schema_setup_struct(schemaxp, 0); - arrayxp = array_setup_struct(arrayxp, 0); - arrayxp->length = 0; - Rcpp::List as = Rcpp::List::create(Rcpp::Named("array_data") = arrayxp, - Rcpp::Named("schema") = schemaxp); - return as; +//' @noRd +//' @import nanoarrow +// [[Rcpp::export]] +nanoarrowXPtr create_empty_arrow_table() { + int ncol = 0; + + // Schema first + auto schemaxp = nanoarrow_schema_owning_xptr(); + auto sch = nanoarrow_output_schema_from_xptr(schemaxp); + exitIfError(ArrowSchemaInitFromType(sch, NANOARROW_TYPE_STRUCT), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, ""), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(sch, ncol), "Bad schema children alloc"); + + // Array second + auto arrayxp = nanoarrow_array_owning_xptr(); + auto arr = nanoarrow_output_array_from_xptr(arrayxp); + exitIfError(ArrowArrayInitFromType(arr, NANOARROW_TYPE_STRUCT), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, ncol), "Bad array children alloc"); + arr->length = 0; + + // Nanoarrow special: stick schema into xptr tag to return single SEXP + array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array + + return arrayxp; } // [[Rcpp::export]] -Rcpp::List sr_next(Rcpp::XPtr sr) { +nanoarrowXPtr sr_next(Rcpp::XPtr sr) { check_xptr_tag(sr); if (sr_complete(sr)) { @@ -198,16 +214,30 @@ Rcpp::List sr_next(Rcpp::XPtr sr) { const std::vector names = sr_data->get()->names(); auto ncol = names.size(); - Rcpp::XPtr schemaxp = schema_owning_xptr(); - Rcpp::XPtr arrayxp = array_owning_xptr(); - schemaxp = schema_setup_struct(schemaxp, ncol); - arrayxp = array_setup_struct(arrayxp, ncol); - arrayxp->length = 0; + //Rcpp::XPtr schemaxp = schema_owning_xptr(); + //Rcpp::XPtr arrayxp = array_owning_xptr(); + //schemaxp = schema_setup_struct(schemaxp, ncol); + //arrayxp = array_setup_struct(arrayxp, ncol); + //arrayxp->length = 0; + // Schema first + auto schemaxp = nanoarrow_schema_owning_xptr(); + auto sch = nanoarrow_output_schema_from_xptr(schemaxp); + exitIfError(ArrowSchemaInitFromType(sch, NANOARROW_TYPE_STRUCT), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, ""), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(sch, ncol), "Bad schema children alloc"); + + // Array second + auto arrayxp = nanoarrow_array_owning_xptr(); + auto arr = nanoarrow_output_array_from_xptr(arrayxp); + exitIfError(ArrowArrayInitFromType(arr, NANOARROW_TYPE_STRUCT), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, ncol), "Bad array children alloc"); + + arr->length = 0; // initial value for (size_t i=0; i chldschemaxp = schema_owning_xptr(); - Rcpp::XPtr chldarrayxp = array_owning_xptr(); + //Rcpp::XPtr chldschemaxp = schema_owning_xptr(); + //Rcpp::XPtr chldarrayxp = array_owning_xptr(); spdl::trace("[sr_next] Accessing {} at {}", names[i], i); @@ -217,21 +247,18 @@ Rcpp::List sr_next(Rcpp::XPtr sr) { // this is pair of array and schema pointer auto pp = tdbs::ArrowAdapter::to_arrow(buf); - memcpy((void*) chldschemaxp, pp.second.get(), sizeof(ArrowSchema)); - memcpy((void*) chldarrayxp, pp.first.get(), sizeof(ArrowArray)); - - schemaxp->children[i] = chldschemaxp; - arrayxp->children[i] = chldarrayxp; + memcpy((void*) sch->children[i], pp.second.get(), sizeof(ArrowSchema)); + memcpy((void*) arr->children[i], pp.first.get(), sizeof(ArrowArray)); - if (pp.first->length > arrayxp->length) { + if (pp.first->length > arr->length) { spdl::debug("[soma_array_reader] Setting array length to {}", pp.first->length); - arrayxp->length = pp.first->length; + arr->length = pp.first->length; } } - spdl::debug("[sr_next] Exporting chunk with {} rows", arrayxp->length); - Rcpp::List as = Rcpp::List::create(Rcpp::Named("array_data") = arrayxp, - Rcpp::Named("schema") = schemaxp); - return as; + spdl::debug("[sr_next] Exporting chunk with {} rows", arr->length); + // Nanoarrow special: stick schema into xptr tag to return single SEXP + array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array + return arrayxp; } diff --git a/apis/r/src/rutilities.h b/apis/r/src/rutilities.h index 3669252ae9..7a5acbf586 100644 --- a/apis/r/src/rutilities.h +++ b/apis/r/src/rutilities.h @@ -62,3 +62,17 @@ struct ContextWrapper { std::shared_ptr ctxptr; }; typedef struct ContextWrapper ctx_wrap_t; + +// some lipstick on the (plain C language) pig that is a SEXP: +// allowing the nanoarrow ArrowArray XPtr be typedef'ed +typedef SEXP nanoarrowXPtr; + +inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { + if (ec != NANOARROW_OK) Rcpp::stop(msg); +} + +// Attaches a schema to an array external pointer. The nanoarrow R package +// attempts to do this whenever possible to avoid misinterpreting arrays. +inline void array_xptr_set_schema(SEXP array_xptr, SEXP schema_xptr) { + R_SetExternalPtrTag(array_xptr, schema_xptr); +} From 0d7761411e7ca388d333d70a632b0aa8839fdf3a Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 1 Mar 2024 12:57:49 -0600 Subject: [PATCH 03/39] More lower-level wiring of nanoarrow --- apis/r/R/utils-arrow.R | 5 +- apis/r/R/utils-readerTransformers.R | 7 +- apis/r/src/RcppExports.cpp | 2 +- apis/r/src/rinterface.cpp | 70 +++++++++---------- apis/r/src/riterator.cpp | 7 +- .../testthat/test-SOMAArrayReader-Iterated.R | 10 +-- libtiledbsoma/src/utils/arrow_adapter.cc | 2 + 7 files changed, 46 insertions(+), 57 deletions(-) diff --git a/apis/r/R/utils-arrow.R b/apis/r/R/utils-arrow.R index 00d365caa0..cda8448b05 100644 --- a/apis/r/R/utils-arrow.R +++ b/apis/r/R/utils-arrow.R @@ -240,11 +240,10 @@ arrow_schema_from_tiledb_schema <- function(x) { arrow::schema(c(dimfields, attfields)) } -#' Validate external pointer to ArrowArray +#' Validate external pointer to ArrowArray which is embedded in a nanoarrow S3 type #' @noRd check_arrow_pointers <- function(arrlst) { - stopifnot("First argument must be an external pointer to ArrowArray" = check_arrow_array_tag(arrlst[[1]]), - "Second argument must be an external pointer to ArrowSchema" = check_arrow_schema_tag(arrlst[[2]])) + stopifnot(inherits(arrlst, "nanoarrow_array")) } #' Validate compatibility of Arrow data types diff --git a/apis/r/R/utils-readerTransformers.R b/apis/r/R/utils-readerTransformers.R index 0256b71146..ea52122adb 100644 --- a/apis/r/R/utils-readerTransformers.R +++ b/apis/r/R/utils-readerTransformers.R @@ -2,14 +2,13 @@ #' #' @description Converts the results of a \link{soma_array_reader} or #' \link{sr_next} to an arrow::\link[arrow]{Table} -#' @param x A List object with two pointers to Arrow array data and schema +#' @param x A nanoarrow_array object which is itself a wrapper around the external pointer +#' to the Arrow array data; the schema external pointer is added to it as well #' @return arrow::\link[arrow]{Table} #' @noRd soma_array_to_arrow_table <- function(x) { check_arrow_pointers(x) - arrow::as_arrow_table( - arrow::RecordBatch$import_from_c(x$array_data, x$schema) - ) + arrow::as_arrow_table(x) } #' Transformer function: Arrow table to Matrix::sparseMatrix diff --git a/apis/r/src/RcppExports.cpp b/apis/r/src/RcppExports.cpp index 15dffdea4f..0d3edad527 100644 --- a/apis/r/src/RcppExports.cpp +++ b/apis/r/src/RcppExports.cpp @@ -12,7 +12,7 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // soma_array_reader -Rcpp::List soma_array_reader(const std::string& uri, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, const std::string& loglevel, Rcpp::Nullable config); +nanoarrowXPtr soma_array_reader(const std::string& uri, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, const std::string& loglevel, Rcpp::Nullable config); RcppExport SEXP _tiledbsoma_soma_array_reader(SEXP uriSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP loglevelSEXP, SEXP configSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index 4a312bb4c3..aef0a2a13b 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -1,4 +1,5 @@ #include // for R interface to C++ +#include // for C interface to Arrow #include // for C interface to Arrow #include // for fromInteger64 @@ -48,15 +49,15 @@ Rcpp::XPtr array_setup_struct(Rcpp::XPtr arrxp, int64_t //' @noRd // [[Rcpp::export(soma_array_reader_impl)]] -Rcpp::List soma_array_reader(const std::string& uri, - Rcpp::Nullable colnames = R_NilValue, - Rcpp::Nullable> qc = R_NilValue, - Rcpp::Nullable dim_points = R_NilValue, - Rcpp::Nullable dim_ranges = R_NilValue, - std::string batch_size = "auto", - std::string result_order = "auto", - const std::string& loglevel = "auto", - Rcpp::Nullable config = R_NilValue) { +nanoarrowXPtr soma_array_reader(const std::string& uri, + Rcpp::Nullable colnames = R_NilValue, + Rcpp::Nullable> qc = R_NilValue, + Rcpp::Nullable dim_points = R_NilValue, + Rcpp::Nullable dim_ranges = R_NilValue, + std::string batch_size = "auto", + std::string result_order = "auto", + const std::string& loglevel = "auto", + Rcpp::Nullable config = R_NilValue) { if (loglevel != "auto") { spdl::set_level(loglevel); @@ -79,7 +80,7 @@ Rcpp::List soma_array_reader(const std::string& uri, // Read selected columns from the uri (return is unique_ptr) auto sr = tdbs::SOMAArray::open(OpenMode::read, - uri, + uri, "unnamed", // name parameter could be added platform_config, column_names, @@ -130,17 +131,22 @@ Rcpp::List soma_array_reader(const std::string& uri, const std::vector names = sr_data->get()->names(); auto ncol = names.size(); - Rcpp::XPtr schemaxp = schema_owning_xptr(); - Rcpp::XPtr arrayxp = array_owning_xptr(); - schemaxp = schema_setup_struct(schemaxp, ncol); - arrayxp = array_setup_struct(arrayxp, ncol); - arrayxp->length = 0; + // Schema first + auto schemaxp = nanoarrow_schema_owning_xptr(); + auto sch = nanoarrow_output_schema_from_xptr(schemaxp); + exitIfError(ArrowSchemaInitFromType(sch, NANOARROW_TYPE_STRUCT), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, ""), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(sch, ncol), "Bad schema children alloc"); - for (size_t i=0; i chldschemaxp = schema_owning_xptr(); - Rcpp::XPtr chldarrayxp = array_owning_xptr(); + // Array second + auto arrayxp = nanoarrow_array_owning_xptr(); + auto arr = nanoarrow_output_array_from_xptr(arrayxp); + exitIfError(ArrowArrayInitFromType(arr, NANOARROW_TYPE_STRUCT), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, ncol), "Bad array children alloc"); + + arr->length = 0; // initial value + for (size_t i=0; ichildren[i], pp.second.get(), sizeof(ArrowSchema)); + memcpy((void*) arr->children[i], pp.first.get(), sizeof(ArrowArray)); spdl::info("[soma_array_reader] Incoming name {} length {}", std::string(pp.second->name), pp.first->length); - schemaxp->children[i] = chldschemaxp; - arrayxp->children[i] = chldarrayxp; - - // if (buf->has_enumeration()) { - // auto vec = buf->get_enumeration(); - // Rcpp::Rcout << names[i] << ": "; - // for (auto& s: vec) { - // Rcpp::Rcout << s << " "; - // } - // Rcpp::Rcout << std::endl; - // } - - if (pp.first->length > arrayxp->length) { + if (pp.first->length > arr->length) { spdl::debug("[soma_array_reader] Setting array length to {}", pp.first->length); - arrayxp->length = pp.first->length; + arr->length = pp.first->length; } } - Rcpp::List as = Rcpp::List::create(Rcpp::Named("array_data") = arrayxp, - Rcpp::Named("schema") = schemaxp); - return as; + // Nanoarrow special: stick schema into xptr tag to return single SEXP + array_xptr_set_schema(arrayxp, schemaxp); // embed schema in array + return arrayxp; } //' Set the logging level for the R package and underlying C++ library diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index f479a986b2..b86c947880 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -5,7 +5,7 @@ #include // for R interface to C++ #include // for C interface to Arrow -#include "nanoarrow.h" +#include #include #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 4 @@ -214,11 +214,6 @@ nanoarrowXPtr sr_next(Rcpp::XPtr sr) { const std::vector names = sr_data->get()->names(); auto ncol = names.size(); - //Rcpp::XPtr schemaxp = schema_owning_xptr(); - //Rcpp::XPtr arrayxp = array_owning_xptr(); - //schemaxp = schema_setup_struct(schemaxp, ncol); - //arrayxp = array_setup_struct(arrayxp, ncol); - //arrayxp->length = 0; // Schema first auto schemaxp = nanoarrow_schema_owning_xptr(); auto sch = nanoarrow_output_schema_from_xptr(schemaxp); diff --git a/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R b/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R index 4f6509d883..6f19409a40 100644 --- a/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R +++ b/apis/r/tests/testthat/test-SOMAArrayReader-Iterated.R @@ -29,9 +29,8 @@ test_that("Iterated Interface from SOMAArrayReader", { expect_true(is.data.frame(rl)) expect_equal(nrow(rl), 4848644) expect_equal(ncol(rl), 3) - rm(sr) - #gc() + gc() srret <- sr_setup(uri, config=as.character(config), dim_points=list(soma_dim_0=as.integer64(1))) sr <- srret$sr @@ -50,7 +49,7 @@ test_that("Iterated Interface from SOMAArrayReader", { expect_equal(ncol(rl), 3) rm(sr) - #gc() + gc() srret <- sr_setup(uri, config=as.character(config), dim_range=list(soma_dim_1=cbind(as.integer64(1),as.integer64(2)))) sr <- srret$sr @@ -195,6 +194,7 @@ test_that("Iterated Interface from SOMA Sparse Matrix", { test_that("Dimension Point and Ranges Bounds", { skip_if(!extended_tests() || covr_tests()) ctx <- tiledbsoma::SOMATileDBContext$new() + config <- as.character(tiledb::config(ctx$context())) human_experiment <- load_dataset("soma-exp-pbmc-small", tiledbsoma_ctx = ctx) X <- human_experiment$ms$get("RNA")$X$get("data") @@ -207,7 +207,7 @@ test_that("Dimension Point and Ranges Bounds", { sr <- srret$sr chunk <- sr_next(sr) - at <- arrow::as_arrow_table(arrow::RecordBatch$import_from_c(chunk$array_data, chunk$schema)) + at <- arrow::as_arrow_table(chunk) expect_equal(at$num_rows, 5) expect_equal(at$num_columns, 3) rm(sr) @@ -220,7 +220,7 @@ test_that("Dimension Point and Ranges Bounds", { sr <- srret$sr chunk <- sr_next(sr) - at <- arrow::as_arrow_table(arrow::RecordBatch$import_from_c(chunk$array_data, chunk$schema)) + at <- arrow::as_arrow_table(chunk) expect_equal(at$num_rows, 2) expect_equal(at$num_columns, 3) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 4f52c0a9be..45224f0000 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -39,6 +39,7 @@ namespace tiledbsoma { using namespace tiledb; void ArrowAdapter::release_schema(struct ArrowSchema* schema) { + return; // FIXME: switch to nanoarrow release schema->release = nullptr; for (int i = 0; i < schema->n_children; ++i) { @@ -70,6 +71,7 @@ void ArrowAdapter::release_schema(struct ArrowSchema* schema) { } void ArrowAdapter::release_array(struct ArrowArray* array) { + return; // FIXME: switch to nanoarrow release auto arrow_buffer = static_cast(array->private_data); LOG_TRACE(fmt::format( From 7d37e6b53143031a984a80d25f17c332907d609d Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Wed, 6 Mar 2024 15:11:44 -0600 Subject: [PATCH 04/39] WIP snapshot with nanoarrow wired into libtiledbsoma --- apis/r/src/rinterface.cpp | 6 +- apis/r/src/riterator.cpp | 7 +- apis/r/tests/testthat/helper-test-data.R | 17 +- apis/r/tests/testthat/test-Factory.R | 4 +- libtiledbsoma/src/CMakeLists.txt | 3 + libtiledbsoma/src/utils/arrow_adapter.cc | 137 +- libtiledbsoma/src/utils/arrow_adapter.h | 10 +- libtiledbsoma/src/utils/nanoarrow.c | 3369 +++++++++++++++++++ libtiledbsoma/src/utils/nanoarrow.h | 3734 ++++++++++++++++++++++ libtiledbsoma/src/utils/nanoarrow.hpp | 553 ++++ 10 files changed, 7800 insertions(+), 40 deletions(-) create mode 100644 libtiledbsoma/src/utils/nanoarrow.c create mode 100644 libtiledbsoma/src/utils/nanoarrow.h create mode 100644 libtiledbsoma/src/utils/nanoarrow.hpp diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index aef0a2a13b..666e31a06b 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -155,8 +155,10 @@ nanoarrowXPtr soma_array_reader(const std::string& uri, // this is pair of array and schema pointer auto pp = tdbs::ArrowAdapter::to_arrow(buf); - memcpy((void*) sch->children[i], pp.second.get(), sizeof(ArrowSchema)); - memcpy((void*) arr->children[i], pp.first.get(), sizeof(ArrowArray)); + //memcpy((void*) sch->children[i], pp.second.get(), sizeof(ArrowSchema)); + //memcpy((void*) arr->children[i], pp.first.get(), sizeof(ArrowArray)); + ArrowArrayMove(pp.first.get(), arr->children[i]); + ArrowSchemaMove(pp.second.get(), sch->children[i]); spdl::info("[soma_array_reader] Incoming name {} length {}", std::string(pp.second->name), pp.first->length); diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index b86c947880..b4bfdcc0df 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -242,14 +242,15 @@ nanoarrowXPtr sr_next(Rcpp::XPtr sr) { // this is pair of array and schema pointer auto pp = tdbs::ArrowAdapter::to_arrow(buf); - memcpy((void*) sch->children[i], pp.second.get(), sizeof(ArrowSchema)); - memcpy((void*) arr->children[i], pp.first.get(), sizeof(ArrowArray)); + //memcpy((void*) sch->children[i], pp.second.get(), sizeof(ArrowSchema)); + //memcpy((void*) arr->children[i], pp.first.get(), sizeof(ArrowArray)); + ArrowArrayMove(pp.first.get(), arr->children[i]); + ArrowSchemaMove(pp.second.get(), sch->children[i]); if (pp.first->length > arr->length) { spdl::debug("[soma_array_reader] Setting array length to {}", pp.first->length); arr->length = pp.first->length; } - } spdl::debug("[sr_next] Exporting chunk with {} rows", arr->length); diff --git a/apis/r/tests/testthat/helper-test-data.R b/apis/r/tests/testthat/helper-test-data.R index 7a51dd14e1..c690bd0e48 100644 --- a/apis/r/tests/testthat/helper-test-data.R +++ b/apis/r/tests/testthat/helper-test-data.R @@ -39,19 +39,20 @@ create_dense_matrix_with_int_dims <- function(nrows = 10, ncols = 5, seed = 1) { } create_arrow_schema <- function(foo_first = TRUE) { + bl <- FALSE if (foo_first) { arrow::schema( - arrow::field("foo", arrow::int32(), nullable = FALSE), - arrow::field("soma_joinid", arrow::int64(), nullable = FALSE), - arrow::field("bar", arrow::float64(), nullable = FALSE), - arrow::field("baz", arrow::large_utf8(), nullable = FALSE) + arrow::field("foo", arrow::int32(), nullable = bl), + arrow::field("soma_joinid", arrow::int64(), nullable = bl), + arrow::field("bar", arrow::float64(), nullable = bl), + arrow::field("baz", arrow::large_utf8(), nullable = bl) ) } else { arrow::schema( - arrow::field("soma_joinid", arrow::int64(), nullable = FALSE), - arrow::field("foo", arrow::int32(), nullable = FALSE), - arrow::field("bar", arrow::float64(), nullable = FALSE), - arrow::field("baz", arrow::large_utf8(), nullable = FALSE) + arrow::field("soma_joinid", arrow::int64(), nullable = bl), + arrow::field("foo", arrow::int32(), nullable = bl), + arrow::field("bar", arrow::float64(), nullable = bl), + arrow::field("baz", arrow::large_utf8(), nullable = bl) ) } } diff --git a/apis/r/tests/testthat/test-Factory.R b/apis/r/tests/testthat/test-Factory.R index 019b486d68..59e1896310 100644 --- a/apis/r/tests/testthat/test-Factory.R +++ b/apis/r/tests/testthat/test-Factory.R @@ -16,7 +16,7 @@ test_that("DataFrame Factory", { # Check opening to read expect_silent(d3 <- SOMADataFrameOpen(uri)) expect_silent(chk <- d3$read()$concat()) - expect_equal(tbl, chk) + expect_equal(tibble::as_tibble(tbl), tibble::as_tibble(chk)) }) test_that("DataFrame Factory with specified index_column_names", { @@ -35,7 +35,7 @@ test_that("DataFrame Factory with specified index_column_names", { expect_silent(d3 <- SOMADataFrameOpen(uri)) expect_equal(d3$mode(), "READ") expect_silent(chk <- d3$read()$concat()) - expect_equal(tbl, chk) + expect_equal(tibble::as_tibble(tbl), tibble::as_tibble(chk)) d3$close() expect_equal(d3$mode(), "CLOSED") }) diff --git a/libtiledbsoma/src/CMakeLists.txt b/libtiledbsoma/src/CMakeLists.txt index 4205da0319..4aa44f4385 100644 --- a/libtiledbsoma/src/CMakeLists.txt +++ b/libtiledbsoma/src/CMakeLists.txt @@ -69,6 +69,7 @@ add_library(TILEDB_SOMA_OBJECTS OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/version.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils/nanoarrow.c ${CMAKE_CURRENT_SOURCE_DIR}/external/src/thread_pool/thread_pool.cc ${CMAKE_CURRENT_SOURCE_DIR}/external/src/thread_pool/status.cc @@ -214,6 +215,8 @@ install(FILES install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/utils/carrow.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/nanoarrow.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/nanoarrow.hpp DESTINATION "include/tiledbsoma/utils/" ) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 45224f0000..90e677b059 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -39,14 +39,26 @@ namespace tiledbsoma { using namespace tiledb; void ArrowAdapter::release_schema(struct ArrowSchema* schema) { - return; // FIXME: switch to nanoarrow release + LOG_DEBUG(fmt::format("[ArrowAdapter] release_schema for {}", schema->name)); schema->release = nullptr; + if (schema->name != nullptr) { + free((void*)schema->name); + schema->name = nullptr; + } + if (schema->format != nullptr) { + free((void*)schema->format); + schema->format = nullptr; + } for (int i = 0; i < schema->n_children; ++i) { struct ArrowSchema* child = schema->children[i]; - if (schema->name != nullptr) { - free((void*)schema->name); - schema->name = nullptr; + if (child->name != nullptr) { + free((void*)child->name); + child->name = nullptr; + } + if (child->format != nullptr) { + free((void*)child->format); + child->format = nullptr; } if (child->release != NULL) { child->release(child); @@ -57,22 +69,32 @@ void ArrowAdapter::release_schema(struct ArrowSchema* schema) { struct ArrowSchema* dict = schema->dictionary; if (dict != nullptr) { + if (dict->name != nullptr) { + free((void*)dict->name); + dict->name = nullptr; + } if (dict->format != nullptr) { free((void*)dict->format); dict->format = nullptr; } if (dict->release != nullptr) { - delete dict; + //delete dict; + free(dict); dict = nullptr; } } - LOG_TRACE("[ArrowAdapter] release_schema"); } void ArrowAdapter::release_array(struct ArrowArray* array) { - return; // FIXME: switch to nanoarrow release auto arrow_buffer = static_cast(array->private_data); + LOG_DEBUG(fmt::format("[ArrowAdapter] release_array for {} cnt {} var {} nullable {} enum {}", + arrow_buffer->buffer_->name(), + arrow_buffer->buffer_.use_count(), + arrow_buffer->buffer_->is_var(), + arrow_buffer->buffer_->is_nullable(), + arrow_buffer->buffer_->has_enumeration() + )); LOG_TRACE(fmt::format( "[ArrowAdapter] release_array {} use_count={}", @@ -86,6 +108,20 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { if (array->buffers != nullptr) { delete[] array->buffers; + array->buffers = nullptr; + } + + if (array->n_children > 0) { + for (int i = 0; i < array->n_children; ++i) { + struct ArrowArray* child = array->children[i]; + if (child != nullptr) { + release_array(child); + free(child); + child = nullptr; + } + } + free(array->children); + array->children = nullptr; } struct ArrowArray* dict = array->dictionary; @@ -95,12 +131,13 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { dict->buffers = nullptr; } if (dict->release != nullptr) { - delete dict; + //delete dict; + free(dict); dict = nullptr; } } - array->release = nullptr; + } std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( @@ -113,13 +150,13 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( arrow_schema->format = "+s"; arrow_schema->n_children = ndim + nattr; arrow_schema->release = &ArrowAdapter::release_schema; - arrow_schema->children = new ArrowSchema*[arrow_schema->n_children]; + arrow_schema->children = (ArrowSchema**) malloc(arrow_schema->n_children * sizeof(ArrowSchema*)); //new ArrowSchema*[arrow_schema->n_children]; ArrowSchema* child = nullptr; for (uint32_t i = 0; i < ndim; ++i) { auto dim = tiledb_schema.domain().dimension(i); - child = arrow_schema->children[i] = new ArrowSchema; + child = arrow_schema->children[i] = (ArrowSchema*) malloc(sizeof(ArrowSchema)); //new ArrowSchema; child->format = ArrowAdapter::to_arrow_format(dim.type()).data(); child->name = strdup(dim.name().c_str()); child->metadata = nullptr; @@ -132,7 +169,7 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( for (uint32_t i = 0; i < nattr; ++i) { auto attr = tiledb_schema.attribute(i); - child = arrow_schema->children[ndim + i] = new ArrowSchema; + child = arrow_schema->children[ndim + i] = (ArrowSchema*) malloc(sizeof(ArrowSchema)); //new ArrowSchema; child->format = ArrowAdapter::to_arrow_format(attr.type()).data(); child->name = strdup(attr.name().c_str()); child->metadata = nullptr; @@ -186,7 +223,7 @@ std::pair ArrowAdapter::_get_data_and_length( // Allocate a single byte to copy the bits into size_t sz = 1; - dst = new const void*[sz]; + dst = malloc(sz); //new const void*[sz]; std::memcpy((void*)dst, &src, sz); return std::pair(dst, data.size()); @@ -255,11 +292,26 @@ bool ArrowAdapter::_isstr(const char* format) { return false; } +inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { + if (ec != NANOARROW_OK) + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Arrow Error {} ", msg)); +} + std::pair, std::unique_ptr> ArrowAdapter::to_arrow(std::shared_ptr column) { std::unique_ptr schema = std::make_unique(); std::unique_ptr array = std::make_unique(); + auto sch = schema.get(); + auto arr = array.get(); + auto coltype = to_arrow_format(column->type()).data(); + auto natype = to_nanoarrow_type(coltype); + exitIfError(ArrowSchemaInitFromType(sch, natype), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, column->name().data()), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); + +#if 0 schema->format = to_arrow_format(column->type()).data(); schema->name = column->name().data(); schema->metadata = nullptr; @@ -267,10 +319,11 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { schema->n_children = 0; schema->children = nullptr; schema->dictionary = nullptr; +#endif schema->release = &release_schema; schema->private_data = nullptr; - int n_buffers = column->is_var() ? 3 : 2; + int n_buffers = column->is_var() ? 3 : 2; // this will be 2 for enumerations and 3 for char vectors // Create an ArrowBuffer to manage the lifetime of `column`. // - `arrow_buffer` holds a shared_ptr to `column`, which @@ -284,7 +337,16 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { // will be deleted. auto arrow_buffer = new ArrowBuffer(column); + exitIfError(ArrowArrayInitFromType(arr, natype), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, 0), "Bad array children alloc"); array->length = column->size(); + + LOG_DEBUG(fmt::format("[ArrowAdapter] column type {} name {} nbuf {} {} nullable {}", + to_arrow_format(column->type()).data(), + column->name().data(), n_buffers, array->n_buffers, column->is_nullable())); + + +#if 0 array->null_count = 0; array->offset = 0; array->n_buffers = n_buffers; @@ -292,6 +354,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { array->buffers = nullptr; array->children = nullptr; array->dictionary = nullptr; +#endif array->release = &release_array; array->private_data = (void*)arrow_buffer; @@ -300,9 +363,9 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { column->name(), column.use_count())); - array->buffers = new const void*[n_buffers]; + array->buffers = (const void**) malloc(sizeof(void*) * n_buffers); //new const void*[n_buffers]; assert(array->buffers != nullptr); - array->buffers[0] = nullptr; // validity + array->buffers[0] = nullptr; // validity addressed below array->buffers[n_buffers - 1] = column->data().data(); // data if (n_buffers == 3) { array->buffers[1] = column->offsets().data(); // offsets @@ -330,10 +393,17 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { } if (column->has_enumeration()) { - auto dict_sch = new ArrowSchema; - auto dict_arr = new ArrowArray; + auto dict_sch = (ArrowSchema*) malloc(sizeof(ArrowSchema)); //new ArrowSchema; + auto dict_arr = (ArrowArray*) malloc(sizeof(ArrowArray)); //new ArrowArray; auto enmr = column->get_enumeration_info(); + auto dcoltype = to_arrow_format(enmr->type(), false).data(); + auto dnatype = to_nanoarrow_type(dcoltype); + + exitIfError(ArrowSchemaInitFromType(dict_sch, dnatype), "Bad schema init"); + exitIfError(ArrowSchemaSetName(dict_sch, ""), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(dict_sch, 0), "Bad schema children alloc"); +#if 0 dict_sch->format = strdup(to_arrow_format(enmr->type(), false).data()); dict_sch->name = nullptr; dict_sch->metadata = nullptr; @@ -343,8 +413,15 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { dict_sch->dictionary = nullptr; dict_sch->release = &release_schema; dict_sch->private_data = nullptr; +#endif + exitIfError(ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(dict_arr, 0), "Bad array children alloc"); const int n_buf = ArrowAdapter::_isstr(dict_sch->format) ? 3 : 2; + dict_arr->buffers = (const void**) malloc(sizeof(void*) * n_buf); //new const void*[n_buf]; + dict_arr->buffers[0] = nullptr; // validity: none here + dict_arr->release = &release_array; +#if 0 dict_arr->null_count = 0; dict_arr->offset = 0; dict_arr->n_buffers = n_buf; @@ -352,11 +429,8 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { dict_arr->buffers = nullptr; dict_arr->children = nullptr; dict_arr->dictionary = nullptr; - dict_arr->release = &release_array; dict_arr->private_data = nullptr; - - dict_arr->buffers = new const void*[n_buf]; - dict_arr->buffers[0] = nullptr; // validity: none here +#endif // TODO string types currently get the data and offset // buffers from ColumnBuffer::enum_offsets and @@ -445,4 +519,23 @@ std::string_view ArrowAdapter::to_arrow_format( tiledb::impl::type_to_str(datatype))); } +// FIXME: Add more types, maybe make it a map +enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { + if (sv == "i") return NANOARROW_TYPE_INT32; + else if (sv == "c") return NANOARROW_TYPE_INT8; + else if (sv == "C") return NANOARROW_TYPE_UINT8; + else if (sv == "s") return NANOARROW_TYPE_INT16; + else if (sv == "S") return NANOARROW_TYPE_UINT16; + else if (sv == "I") return NANOARROW_TYPE_UINT32; + else if (sv == "l") return NANOARROW_TYPE_INT64; + else if (sv == "L") return NANOARROW_TYPE_UINT64; + else if (sv == "f") return NANOARROW_TYPE_FLOAT; + else if (sv == "g") return NANOARROW_TYPE_DOUBLE; + else if (sv == "u") return NANOARROW_TYPE_STRING; + else if (sv == "U") return NANOARROW_TYPE_LARGE_STRING; + else if (sv == "b") return NANOARROW_TYPE_BOOL; + else throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB datatype string: {} ", sv)); +} + } // namespace tiledbsoma diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 7ef09173d1..a84a37506c 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -8,9 +8,11 @@ // https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout // https://arrow.apache.org/docs/format/CDataInterface.html#exporting-a-simple-int32-array -#ifndef ARROW_SCHEMA_AND_ARRAY_DEFINED -#include "carrow.h" -#endif +#include "nanoarrow.hpp" +//#ifndef ARROW_SCHEMA_AND_ARRAY_DEFINED +//#include "carrow.h" +//#endif + namespace tiledbsoma { using namespace tiledb; @@ -60,6 +62,8 @@ class ArrowAdapter { static std::string_view to_arrow_format( tiledb_datatype_t datatype, bool use_large = true); + static enum ArrowType to_nanoarrow_type(std::string_view sv); + private: static std::pair _get_data_and_length( Enumeration& enmr, const void* dst); diff --git a/libtiledbsoma/src/utils/nanoarrow.c b/libtiledbsoma/src/utils/nanoarrow.c new file mode 100644 index 0000000000..d7925587f5 --- /dev/null +++ b/libtiledbsoma/src/utils/nanoarrow.c @@ -0,0 +1,3369 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "nanoarrow.h" + +const char* ArrowNanoarrowVersion(void) { return NANOARROW_VERSION; } + +int ArrowNanoarrowVersionInt(void) { return NANOARROW_VERSION_INT; } + +ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { + if (error == NULL) { + return NANOARROW_OK; + } + + memset(error->message, 0, sizeof(error->message)); + + va_list args; + va_start(args, fmt); + int chars_needed = vsnprintf(error->message, sizeof(error->message), fmt, args); + va_end(args); + + if (chars_needed < 0) { + return EINVAL; + } else if (((size_t)chars_needed) >= sizeof(error->message)) { + return ERANGE; + } else { + return NANOARROW_OK; + } +} + +void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_VALIDITY; + layout->buffer_data_type[0] = NANOARROW_TYPE_BOOL; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = storage_type; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[2] = NANOARROW_TYPE_UNINITIALIZED; + + layout->element_size_bits[0] = 1; + layout->element_size_bits[1] = 0; + layout->element_size_bits[2] = 0; + + layout->child_size_elements = 0; + + switch (storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + case NANOARROW_TYPE_NA: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + layout->element_size_bits[0] = 0; + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_LARGE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + break; + + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + + case NANOARROW_TYPE_BOOL: + layout->element_size_bits[1] = 1; + break; + + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + layout->element_size_bits[1] = 8; + break; + + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_HALF_FLOAT: + layout->element_size_bits[1] = 16; + break; + + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_FLOAT: + layout->element_size_bits[1] = 32; + break; + case NANOARROW_TYPE_INTERVAL_MONTHS: + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + layout->element_size_bits[1] = 64; + break; + + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + layout->element_size_bits[1] = 128; + break; + + case NANOARROW_TYPE_DECIMAL256: + layout->element_size_bits[1] = 256; + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY; + break; + + case NANOARROW_TYPE_DENSE_UNION: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; + layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_UNION_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_SPARSE_UNION: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; + layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = storage_type; + break; + + case NANOARROW_TYPE_LARGE_STRING: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_STRING; + break; + case NANOARROW_TYPE_LARGE_BINARY: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; + break; + + default: + break; + } +} + +void* ArrowMalloc(int64_t size) { return malloc(size); } + +void* ArrowRealloc(void* ptr, int64_t size) { return realloc(ptr, size); } + +void ArrowFree(void* ptr) { free(ptr); } + +static uint8_t* ArrowBufferAllocatorMallocReallocate( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, + int64_t new_size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(old_size); + return (uint8_t*)ArrowRealloc(ptr, new_size); +} + +static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(size); + if (ptr != NULL) { + ArrowFree(ptr); + } +} + +static struct ArrowBufferAllocator ArrowBufferAllocatorMalloc = { + &ArrowBufferAllocatorMallocReallocate, &ArrowBufferAllocatorMallocFree, NULL}; + +struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { + return ArrowBufferAllocatorMalloc; +} + +static uint8_t* ArrowBufferDeallocatorReallocate(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t old_size, + int64_t new_size) { + NANOARROW_UNUSED(new_size); + + // Attempting to reallocate a buffer with a custom deallocator is + // a programming error. In debug mode, crash here. +#if defined(NANOARROW_DEBUG) + NANOARROW_PRINT_AND_DIE(ENOMEM, + "It is an error to reallocate a buffer whose allocator is " + "ArrowBufferDeallocator()"); +#endif + + // In release mode, ensure the the deallocator is called exactly + // once using the pointer it was given and return NULL, which + // will trigger the caller to return ENOMEM. + allocator->free(allocator, ptr, old_size); + *allocator = ArrowBufferAllocatorDefault(); + return NULL; +} + +struct ArrowBufferAllocator ArrowBufferDeallocator( + void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t size), + void* private_data) { + struct ArrowBufferAllocator allocator; + allocator.reallocate = &ArrowBufferDeallocatorReallocate; + allocator.free = custom_free; + allocator.private_data = private_data; + return allocator; +} + +static const int kInt32DecimalDigits = 9; + +static const uint64_t kUInt32PowersOfTen[] = { + 1ULL, 10ULL, 100ULL, 1000ULL, 10000ULL, + 100000ULL, 1000000ULL, 10000000ULL, 100000000ULL, 1000000000ULL}; + +// Adapted from Arrow C++ to use 32-bit words for better C portability +// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L524-L544 +static void ShiftAndAdd(struct ArrowStringView value, uint32_t* out, int64_t out_size) { + // We use strtoll for parsing, which needs input that is null-terminated + char chunk_string[16]; + + for (int64_t posn = 0; posn < value.size_bytes;) { + int64_t remaining = value.size_bytes - posn; + + int64_t group_size; + if (remaining > kInt32DecimalDigits) { + group_size = kInt32DecimalDigits; + } else { + group_size = remaining; + } + + const uint64_t multiple = kUInt32PowersOfTen[group_size]; + + memcpy(chunk_string, value.data + posn, group_size); + chunk_string[group_size] = '\0'; + uint32_t chunk = (uint32_t)strtoll(chunk_string, NULL, 10); + + for (int64_t i = 0; i < out_size; i++) { + uint64_t tmp = out[i]; + tmp *= multiple; + tmp += chunk; + out[i] = (uint32_t)(tmp & 0xFFFFFFFFULL); + chunk = (uint32_t)(tmp >> 32); + } + posn += group_size; + } +} + +ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, + struct ArrowStringView value) { + // Check for sign + int is_negative = value.data[0] == '-'; + int has_sign = is_negative || value.data[0] == '+'; + value.data += has_sign; + value.size_bytes -= has_sign; + + // Check all characters are digits that are not the negative sign + for (int64_t i = 0; i < value.size_bytes; i++) { + char c = value.data[i]; + if (c < '0' || c > '9') { + return EINVAL; + } + } + + // Skip over leading 0s + int64_t n_leading_zeroes = 0; + for (int64_t i = 0; i < value.size_bytes; i++) { + if (value.data[i] == '0') { + n_leading_zeroes++; + } else { + break; + } + } + + value.data += n_leading_zeroes; + value.size_bytes -= n_leading_zeroes; + + // Use 32-bit words for portability + uint32_t words32[8]; + int n_words32 = decimal->n_words * 2; + NANOARROW_DCHECK(n_words32 <= 8); + memset(words32, 0, sizeof(words32)); + + ShiftAndAdd(value, words32, n_words32); + + if (decimal->low_word_index == 0) { + memcpy(decimal->words, words32, sizeof(uint32_t) * n_words32); + } else { + uint64_t lo; + uint64_t hi; + + for (int i = 0; i < decimal->n_words; i++) { + lo = (uint64_t)words32[i * 2]; + hi = (uint64_t)words32[i * 2 + 1] << 32; + decimal->words[decimal->n_words - i - 1] = lo | hi; + } + } + + if (is_negative) { + ArrowDecimalNegate(decimal); + } + + return NANOARROW_OK; +} + +// Adapted from Arrow C++ for C +// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L365 +ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, + struct ArrowBuffer* buffer) { + int is_negative = ArrowDecimalSign(decimal) < 0; + + uint64_t words_little_endian[4]; + if (decimal->low_word_index == 0) { + memcpy(words_little_endian, decimal->words, decimal->n_words * sizeof(uint64_t)); + } else { + for (int i = 0; i < decimal->n_words; i++) { + words_little_endian[i] = decimal->words[decimal->n_words - i - 1]; + } + } + + // We've already made a copy, so negate that if needed + if (is_negative) { + uint64_t carry = 1; + for (int i = 0; i < decimal->n_words; i++) { + uint64_t elem = words_little_endian[i]; + elem = ~elem + carry; + carry &= (elem == 0); + words_little_endian[i] = elem; + } + } + + // Find the most significant word that is non-zero + int most_significant_elem_idx = -1; + for (int i = decimal->n_words - 1; i >= 0; i--) { + if (words_little_endian[i] != 0) { + most_significant_elem_idx = i; + break; + } + } + + // If they are all zero, the output is just '0' + if (most_significant_elem_idx == -1) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(buffer, '0')); + return NANOARROW_OK; + } + + // Define segments such that each segment represents 9 digits with the + // least significant group of 9 digits first. For example, if the input represents + // 9876543210123456789, then segments will be [123456789, 876543210, 9]. + // We handle at most a signed 256 bit integer, whose maximum value occupies 77 + // characters. Thus, we need at most 9 segments. + const uint32_t k1e9 = 1000000000U; + int num_segments = 0; + uint32_t segments[9]; + memset(segments, 0, sizeof(segments)); + uint64_t* most_significant_elem = words_little_endian + most_significant_elem_idx; + + do { + // Compute remainder = words_little_endian % 1e9 and words_little_endian = + // words_little_endian / 1e9. + uint32_t remainder = 0; + uint64_t* elem = most_significant_elem; + + do { + // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer); + // *elem = dividend / 1e9; + // remainder = dividend % 1e9. + uint32_t hi = (uint32_t)(*elem >> 32); + uint32_t lo = (uint32_t)(*elem & 0xFFFFFFFFULL); + uint64_t dividend_hi = ((uint64_t)(remainder) << 32) | hi; + uint64_t quotient_hi = dividend_hi / k1e9; + remainder = (uint32_t)(dividend_hi % k1e9); + uint64_t dividend_lo = ((uint64_t)(remainder) << 32) | lo; + uint64_t quotient_lo = dividend_lo / k1e9; + remainder = (uint32_t)(dividend_lo % k1e9); + + *elem = (quotient_hi << 32) | quotient_lo; + } while (elem-- != words_little_endian); + + segments[num_segments++] = remainder; + } while (*most_significant_elem != 0 || most_significant_elem-- != words_little_endian); + + // We know our output has no more than 9 digits per segment, plus a negative sign, + // plus any further digits between our output of 9 digits plus enough + // extra characters to ensure that snprintf() with n = 21 (maximum length of %lu + // including a the null terminator) is bounded properly. + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, num_segments * 9 + 1 + 21 - 9)); + if (is_negative) { + buffer->data[buffer->size_bytes++] = '-'; + } + + // The most significant segment should have no leading zeroes + int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu", + (unsigned long)segments[num_segments - 1]); + buffer->size_bytes += n_chars; + + // Subsequent output needs to be left-padded with zeroes such that each segment + // takes up exactly 9 digits. + for (int i = num_segments - 2; i >= 0; i--) { + int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%09lu", + (unsigned long)segments[i]); + buffer->size_bytes += n_chars; + NANOARROW_DCHECK(buffer->size_bytes <= buffer->capacity_bytes); + } + + return NANOARROW_OK; +} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "nanoarrow.h" + +static void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { + if (schema->format != NULL) ArrowFree((void*)schema->format); + if (schema->name != NULL) ArrowFree((void*)schema->name); + if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); + + // This object owns the memory for all the children, but those + // children may have been generated elsewhere and might have + // their own release() callback. + if (schema->children != NULL) { + for (int64_t i = 0; i < schema->n_children; i++) { + if (schema->children[i] != NULL) { + if (schema->children[i]->release != NULL) { + ArrowSchemaRelease(schema->children[i]); + } + + ArrowFree(schema->children[i]); + } + } + + ArrowFree(schema->children); + } + + // This object owns the memory for the dictionary but it + // may have been generated somewhere else and have its own + // release() callback. + if (schema->dictionary != NULL) { + if (schema->dictionary->release != NULL) { + ArrowSchemaRelease(schema->dictionary); + } + + ArrowFree(schema->dictionary); + } + + // private data not currently used + if (schema->private_data != NULL) { + ArrowFree(schema->private_data); + } + + schema->release = NULL; +} + +static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_UNINITIALIZED: + return NULL; + case NANOARROW_TYPE_NA: + return "n"; + case NANOARROW_TYPE_BOOL: + return "b"; + + case NANOARROW_TYPE_UINT8: + return "C"; + case NANOARROW_TYPE_INT8: + return "c"; + case NANOARROW_TYPE_UINT16: + return "S"; + case NANOARROW_TYPE_INT16: + return "s"; + case NANOARROW_TYPE_UINT32: + return "I"; + case NANOARROW_TYPE_INT32: + return "i"; + case NANOARROW_TYPE_UINT64: + return "L"; + case NANOARROW_TYPE_INT64: + return "l"; + + case NANOARROW_TYPE_HALF_FLOAT: + return "e"; + case NANOARROW_TYPE_FLOAT: + return "f"; + case NANOARROW_TYPE_DOUBLE: + return "g"; + + case NANOARROW_TYPE_STRING: + return "u"; + case NANOARROW_TYPE_LARGE_STRING: + return "U"; + case NANOARROW_TYPE_BINARY: + return "z"; + case NANOARROW_TYPE_LARGE_BINARY: + return "Z"; + + case NANOARROW_TYPE_DATE32: + return "tdD"; + case NANOARROW_TYPE_DATE64: + return "tdm"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "tiM"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "tiD"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "tin"; + + case NANOARROW_TYPE_LIST: + return "+l"; + case NANOARROW_TYPE_LARGE_LIST: + return "+L"; + case NANOARROW_TYPE_STRUCT: + return "+s"; + case NANOARROW_TYPE_MAP: + return "+m"; + + default: + return NULL; + } +} + +static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, + enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); + ArrowSchemaInit(schema->children[0]); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "item")); + break; + case NANOARROW_TYPE_MAP: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_STRUCT)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "entries")); + schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema->children[0], 2)); + ArrowSchemaInit(schema->children[0]->children[0]); + ArrowSchemaInit(schema->children[0]->children[1]); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetName(schema->children[0]->children[0], "key")); + schema->children[0]->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetName(schema->children[0]->children[1], "value")); + break; + default: + break; + } + + return NANOARROW_OK; +} + +void ArrowSchemaInit(struct ArrowSchema* schema) { + schema->format = NULL; + schema->name = NULL; + schema->metadata = NULL; + schema->flags = ARROW_FLAG_NULLABLE; + schema->n_children = 0; + schema->children = NULL; + schema->dictionary = NULL; + schema->private_data = NULL; + schema->release = &ArrowSchemaReleaseInternal; +} + +ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) { + // We don't allocate the dictionary because it has to be nullptr + // for non-dictionary-encoded arrays. + + // Set the format to a valid format string for type + const char* template_format = ArrowSchemaFormatTemplate(type); + + // If type isn't recognized and not explicitly unset + if (template_format == NULL && type != NANOARROW_TYPE_UNINITIALIZED) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, template_format)); + + // For types with an umabiguous child structure, allocate children + return ArrowSchemaInitChildrenIfNeeded(schema, type); +} + +ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRUCT)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); + for (int64_t i = 0; i < n_children; i++) { + ArrowSchemaInit(schema->children[i]); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type) { + ArrowSchemaInit(schema); + + int result = ArrowSchemaSetType(schema, type); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema); + return result; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, + enum ArrowType type, int32_t fixed_size) { + if (fixed_size <= 0) { + return EINVAL; + } + + char buffer[64]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + n_chars = snprintf(buffer, sizeof(buffer), "w:%d", (int)fixed_size); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + n_chars = snprintf(buffer, sizeof(buffer), "+w:%d", (int)fixed_size); + break; + default: + return EINVAL; + } + + buffer[n_chars] = '\0'; + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer)); + + if (type == NANOARROW_TYPE_FIXED_SIZE_LIST) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaInitChildrenIfNeeded(schema, type)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, + int32_t decimal_precision, + int32_t decimal_scale) { + if (decimal_precision <= 0) { + return EINVAL; + } + + char buffer[64]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_DECIMAL128: + n_chars = + snprintf(buffer, sizeof(buffer), "d:%d,%d", decimal_precision, decimal_scale); + break; + case NANOARROW_TYPE_DECIMAL256: + n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,256", decimal_precision, + decimal_scale); + break; + default: + return EINVAL; + } + + buffer[n_chars] = '\0'; + return ArrowSchemaSetFormat(schema, buffer); +} + +static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) { + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case NANOARROW_TIME_UNIT_MILLI: + return "m"; + case NANOARROW_TIME_UNIT_MICRO: + return "u"; + case NANOARROW_TIME_UNIT_NANO: + return "n"; + default: + return NULL; + } +} + +ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, + enum ArrowTimeUnit time_unit, + const char* timezone) { + const char* time_unit_str = ArrowTimeUnitFormatString(time_unit); + if (time_unit_str == NULL) { + return EINVAL; + } + + char buffer[128]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_TIME32: + if (timezone != NULL) { + return EINVAL; + } + + switch (time_unit) { + case NANOARROW_TIME_UNIT_MICRO: + case NANOARROW_TIME_UNIT_NANO: + return EINVAL; + default: + break; + } + + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); + break; + case NANOARROW_TYPE_TIME64: + if (timezone != NULL) { + return EINVAL; + } + + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + case NANOARROW_TIME_UNIT_MILLI: + return EINVAL; + default: + break; + } + + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); + break; + case NANOARROW_TYPE_TIMESTAMP: + if (timezone == NULL) { + timezone = ""; + } + n_chars = snprintf(buffer, sizeof(buffer), "ts%s:%s", time_unit_str, timezone); + break; + case NANOARROW_TYPE_DURATION: + if (timezone != NULL) { + return EINVAL; + } + n_chars = snprintf(buffer, sizeof(buffer), "tD%s", time_unit_str); + break; + default: + return EINVAL; + } + + if (((size_t)n_chars) >= sizeof(buffer)) { + return ERANGE; + } + + buffer[n_chars] = '\0'; + + return ArrowSchemaSetFormat(schema, buffer); +} + +ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, + int64_t n_children) { + if (n_children < 0 || n_children > 127) { + return EINVAL; + } + + // Max valid size would be +ud:0,1,...126 = 401 characters + null terminator + char format_out[512]; + int64_t format_out_size = 512; + memset(format_out, 0, format_out_size); + int n_chars; + char* format_cursor = format_out; + + switch (type) { + case NANOARROW_TYPE_SPARSE_UNION: + n_chars = snprintf(format_cursor, format_out_size, "+us:"); + format_cursor += n_chars; + format_out_size -= n_chars; + break; + case NANOARROW_TYPE_DENSE_UNION: + n_chars = snprintf(format_cursor, format_out_size, "+ud:"); + format_cursor += n_chars; + format_out_size -= n_chars; + break; + default: + return EINVAL; + } + + if (n_children > 0) { + n_chars = snprintf(format_cursor, format_out_size, "0"); + format_cursor += n_chars; + format_out_size -= n_chars; + + for (int64_t i = 1; i < n_children; i++) { + n_chars = snprintf(format_cursor, format_out_size, ",%d", (int)i); + format_cursor += n_chars; + format_out_size -= n_chars; + } + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out)); + + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); + for (int64_t i = 0; i < n_children; i++) { + ArrowSchemaInit(schema->children[i]); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format) { + if (schema->format != NULL) { + ArrowFree((void*)schema->format); + } + + if (format != NULL) { + size_t format_size = strlen(format) + 1; + schema->format = (const char*)ArrowMalloc(format_size); + if (schema->format == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->format, format, format_size); + } else { + schema->format = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name) { + if (schema->name != NULL) { + ArrowFree((void*)schema->name); + } + + if (name != NULL) { + size_t name_size = strlen(name) + 1; + schema->name = (const char*)ArrowMalloc(name_size); + if (schema->name == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->name, name, name_size); + } else { + schema->name = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata) { + if (schema->metadata != NULL) { + ArrowFree((void*)schema->metadata); + } + + if (metadata != NULL) { + size_t metadata_size = ArrowMetadataSizeOf(metadata); + schema->metadata = (const char*)ArrowMalloc(metadata_size); + if (schema->metadata == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->metadata, metadata, metadata_size); + } else { + schema->metadata = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, + int64_t n_children) { + if (schema->children != NULL) { + return EEXIST; + } + + if (n_children > 0) { + schema->children = + (struct ArrowSchema**)ArrowMalloc(n_children * sizeof(struct ArrowSchema*)); + + if (schema->children == NULL) { + return ENOMEM; + } + + schema->n_children = n_children; + + memset(schema->children, 0, n_children * sizeof(struct ArrowSchema*)); + + for (int64_t i = 0; i < n_children; i++) { + schema->children[i] = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); + + if (schema->children[i] == NULL) { + return ENOMEM; + } + + schema->children[i]->release = NULL; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) { + if (schema->dictionary != NULL) { + return EEXIST; + } + + schema->dictionary = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); + if (schema->dictionary == NULL) { + return ENOMEM; + } + + schema->dictionary->release = NULL; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, + struct ArrowSchema* schema_out) { + ArrowSchemaInit(schema_out); + + int result = ArrowSchemaSetFormat(schema_out, schema->format); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + + schema_out->flags = schema->flags; + + result = ArrowSchemaSetName(schema_out, schema->name); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + + result = ArrowSchemaSetMetadata(schema_out, schema->metadata); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + + result = ArrowSchemaAllocateChildren(schema_out, schema->n_children); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + + for (int64_t i = 0; i < schema->n_children; i++) { + result = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + } + + if (schema->dictionary != NULL) { + result = ArrowSchemaAllocateDictionary(schema_out); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + + result = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + } + + return NANOARROW_OK; +} + +static void ArrowSchemaViewSetPrimitive(struct ArrowSchemaView* schema_view, + enum ArrowType type) { + schema_view->type = type; + schema_view->storage_type = type; +} + +static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, + const char* format, + const char** format_end_out, + struct ArrowError* error) { + *format_end_out = format; + + // needed for decimal parsing + const char* parse_start; + char* parse_end; + + switch (format[0]) { + case 'n': + schema_view->type = NANOARROW_TYPE_NA; + schema_view->storage_type = NANOARROW_TYPE_NA; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'b': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BOOL); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'c': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT8); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'C': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT8); + *format_end_out = format + 1; + return NANOARROW_OK; + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT16); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'S': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT16); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'i': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'I': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT32); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'l': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'L': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT64); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'e': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_HALF_FLOAT); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'f': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_FLOAT); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'g': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DOUBLE); + *format_end_out = format + 1; + return NANOARROW_OK; + + // decimal + case 'd': + if (format[1] != ':' || format[2] == '\0') { + ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'"); + return EINVAL; + } + + parse_start = format + 2; + schema_view->decimal_precision = (int32_t)strtol(parse_start, &parse_end, 10); + if (parse_end == parse_start || parse_end[0] != ',') { + ArrowErrorSet(error, "Expected 'precision,scale[,bitwidth]' following 'd:'"); + return EINVAL; + } + + parse_start = parse_end + 1; + schema_view->decimal_scale = (int32_t)strtol(parse_start, &parse_end, 10); + if (parse_end == parse_start) { + ArrowErrorSet(error, "Expected 'scale[,bitwidth]' following 'd:precision,'"); + return EINVAL; + } else if (parse_end[0] != ',') { + schema_view->decimal_bitwidth = 128; + } else { + parse_start = parse_end + 1; + schema_view->decimal_bitwidth = (int32_t)strtol(parse_start, &parse_end, 10); + if (parse_start == parse_end) { + ArrowErrorSet(error, "Expected precision following 'd:precision,scale,'"); + return EINVAL; + } + } + + *format_end_out = parse_end; + + switch (schema_view->decimal_bitwidth) { + case 128: + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL128); + return NANOARROW_OK; + case 256: + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL256); + return NANOARROW_OK; + default: + ArrowErrorSet(error, "Expected decimal bitwidth of 128 or 256 but found %d", + (int)schema_view->decimal_bitwidth); + return EINVAL; + } + + // validity + data + case 'w': + schema_view->type = NANOARROW_TYPE_FIXED_SIZE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_BINARY; + if (format[1] != ':' || format[2] == '\0') { + ArrowErrorSet(error, "Expected ':' following 'w'"); + return EINVAL; + } + + schema_view->fixed_size = (int32_t)strtol(format + 2, (char**)format_end_out, 10); + return NANOARROW_OK; + + // validity + offset + data + case 'z': + schema_view->type = NANOARROW_TYPE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_BINARY; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'u': + schema_view->type = NANOARROW_TYPE_STRING; + schema_view->storage_type = NANOARROW_TYPE_STRING; + *format_end_out = format + 1; + return NANOARROW_OK; + + // validity + large_offset + data + case 'Z': + schema_view->type = NANOARROW_TYPE_LARGE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_LARGE_BINARY; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'U': + schema_view->type = NANOARROW_TYPE_LARGE_STRING; + schema_view->storage_type = NANOARROW_TYPE_LARGE_STRING; + *format_end_out = format + 1; + return NANOARROW_OK; + + // nested types + case '+': + switch (format[1]) { + // list has validity + offset or offset + case 'l': + schema_view->storage_type = NANOARROW_TYPE_LIST; + schema_view->type = NANOARROW_TYPE_LIST; + *format_end_out = format + 2; + return NANOARROW_OK; + + // large list has validity + large_offset or large_offset + case 'L': + schema_view->storage_type = NANOARROW_TYPE_LARGE_LIST; + schema_view->type = NANOARROW_TYPE_LARGE_LIST; + *format_end_out = format + 2; + return NANOARROW_OK; + + // just validity buffer + case 'w': + if (format[2] != ':' || format[3] == '\0') { + ArrowErrorSet(error, "Expected ':' following '+w'"); + return EINVAL; + } + + schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_LIST; + schema_view->type = NANOARROW_TYPE_FIXED_SIZE_LIST; + schema_view->fixed_size = + (int32_t)strtol(format + 3, (char**)format_end_out, 10); + return NANOARROW_OK; + case 's': + schema_view->storage_type = NANOARROW_TYPE_STRUCT; + schema_view->type = NANOARROW_TYPE_STRUCT; + *format_end_out = format + 2; + return NANOARROW_OK; + case 'm': + schema_view->storage_type = NANOARROW_TYPE_MAP; + schema_view->type = NANOARROW_TYPE_MAP; + *format_end_out = format + 2; + return NANOARROW_OK; + + // unions + case 'u': + switch (format[2]) { + case 'd': + schema_view->storage_type = NANOARROW_TYPE_DENSE_UNION; + schema_view->type = NANOARROW_TYPE_DENSE_UNION; + break; + case 's': + schema_view->storage_type = NANOARROW_TYPE_SPARSE_UNION; + schema_view->type = NANOARROW_TYPE_SPARSE_UNION; + break; + default: + ArrowErrorSet(error, + "Expected union format string +us: or " + "+ud: but found '%s'", + format); + return EINVAL; + } + + if (format[3] == ':') { + schema_view->union_type_ids = format + 4; + int64_t n_type_ids = + _ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL); + if (n_type_ids != schema_view->schema->n_children) { + ArrowErrorSet( + error, + "Expected union type_ids parameter to be a comma-separated list of %ld " + "values between 0 and 127 but found '%s'", + (long)schema_view->schema->n_children, schema_view->union_type_ids); + return EINVAL; + } + *format_end_out = format + strlen(format); + return NANOARROW_OK; + } else { + ArrowErrorSet(error, + "Expected union format string +us: or +ud: " + "but found '%s'", + format); + return EINVAL; + } + + default: + ArrowErrorSet(error, "Expected nested type format string but found '%s'", + format); + return EINVAL; + } + + // date/time types + case 't': + switch (format[1]) { + // date + case 'd': + switch (format[2]) { + case 'D': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_DATE32; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DATE64; + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, "Expected 'D' or 'm' following 'td' but found '%s'", + format + 2); + return EINVAL; + } + + // time of day + case 't': + switch (format[2]) { + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_TIME32; + schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_TIME32; + schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIME64; + schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIME64; + schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet( + error, "Expected 's', 'm', 'u', or 'n' following 'tt' but found '%s'", + format + 2); + return EINVAL; + } + + // timestamp + case 's': + switch (format[2]) { + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; + break; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; + break; + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; + break; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; + break; + default: + ArrowErrorSet( + error, "Expected 's', 'm', 'u', or 'n' following 'ts' but found '%s'", + format + 2); + return EINVAL; + } + + if (format[3] != ':') { + ArrowErrorSet(error, "Expected ':' following '%.3s' but found '%s'", format, + format + 3); + return EINVAL; + } + + schema_view->timezone = format + 4; + *format_end_out = format + strlen(format); + return NANOARROW_OK; + + // duration + case 'D': + switch (format[2]) { + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, + "Expected 's', 'm', u', or 'n' following 'tD' but found '%s'", + format + 2); + return EINVAL; + } + + // interval + case 'i': + switch (format[2]) { + case 'M': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTHS); + *format_end_out = format + 3; + return NANOARROW_OK; + case 'D': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_DAY_TIME); + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO); + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, + "Expected 'M', 'D', or 'n' following 'ti' but found '%s'", + format + 2); + return EINVAL; + } + + default: + ArrowErrorSet( + error, "Expected 'd', 't', 's', 'D', or 'i' following 't' but found '%s'", + format + 1); + return EINVAL; + } + + default: + ArrowErrorSet(error, "Unknown format: '%s'", format); + return EINVAL; + } +} + +static ArrowErrorCode ArrowSchemaViewValidateNChildren( + struct ArrowSchemaView* schema_view, int64_t n_children, struct ArrowError* error) { + if (n_children != -1 && schema_view->schema->n_children != n_children) { + ArrowErrorSet(error, "Expected schema with %d children but found %d children", + (int)n_children, (int)schema_view->schema->n_children); + return EINVAL; + } + + // Don't do a full validation of children but do check that they won't + // segfault if inspected + struct ArrowSchema* child; + for (int64_t i = 0; i < schema_view->schema->n_children; i++) { + child = schema_view->schema->children[i]; + if (child == NULL) { + ArrowErrorSet(error, + "Expected valid schema at schema->children[%ld] but found NULL", + (long)i); + return EINVAL; + } else if (child->release == NULL) { + ArrowErrorSet( + error, + "Expected valid schema at schema->children[%ld] but found a released schema", + (long)i); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowSchemaViewValidateUnion(struct ArrowSchemaView* schema_view, + struct ArrowError* error) { + return ArrowSchemaViewValidateNChildren(schema_view, -1, error); +} + +static ArrowErrorCode ArrowSchemaViewValidateMap(struct ArrowSchemaView* schema_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewValidateNChildren(schema_view, 1, error)); + + if (schema_view->schema->children[0]->n_children != 2) { + ArrowErrorSet(error, "Expected child of map type to have 2 children but found %d", + (int)schema_view->schema->children[0]->n_children); + return EINVAL; + } + + if (strcmp(schema_view->schema->children[0]->format, "+s") != 0) { + ArrowErrorSet(error, "Expected format of child of map type to be '+s' but found '%s'", + schema_view->schema->children[0]->format); + return EINVAL; + } + + if (schema_view->schema->children[0]->flags & ARROW_FLAG_NULLABLE) { + ArrowErrorSet(error, + "Expected child of map type to be non-nullable but was nullable"); + return EINVAL; + } + + if (schema_view->schema->children[0]->children[0]->flags & ARROW_FLAG_NULLABLE) { + ArrowErrorSet(error, "Expected key of map type to be non-nullable but was nullable"); + return EINVAL; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowSchemaViewValidateDictionary( + struct ArrowSchemaView* schema_view, struct ArrowError* error) { + // check for valid index type + switch (schema_view->storage_type) { + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + break; + default: + ArrowErrorSet( + error, + "Expected dictionary schema index type to be an integral type but found '%s'", + schema_view->schema->format); + return EINVAL; + } + + struct ArrowSchemaView dictionary_schema_view; + return ArrowSchemaViewInit(&dictionary_schema_view, schema_view->schema->dictionary, + error); +} + +static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_view, + enum ArrowType type, + struct ArrowError* error) { + switch (type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_HALF_FLOAT: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_DATE32: + case NANOARROW_TYPE_DATE64: + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + case NANOARROW_TYPE_TIMESTAMP: + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + return ArrowSchemaViewValidateNChildren(schema_view, 0, error); + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (schema_view->fixed_size <= 0) { + ArrowErrorSet(error, "Expected size > 0 for fixed size binary but found size %d", + schema_view->fixed_size); + return EINVAL; + } + return ArrowSchemaViewValidateNChildren(schema_view, 0, error); + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return ArrowSchemaViewValidateNChildren(schema_view, 1, error); + + case NANOARROW_TYPE_STRUCT: + return ArrowSchemaViewValidateNChildren(schema_view, -1, error); + + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + return ArrowSchemaViewValidateUnion(schema_view, error); + + case NANOARROW_TYPE_MAP: + return ArrowSchemaViewValidateMap(schema_view, error); + + case NANOARROW_TYPE_DICTIONARY: + return ArrowSchemaViewValidateDictionary(schema_view, error); + + default: + ArrowErrorSet(error, "Expected a valid enum ArrowType value but found %d", + (int)schema_view->type); + return EINVAL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, + const struct ArrowSchema* schema, + struct ArrowError* error) { + if (schema == NULL) { + ArrowErrorSet(error, "Expected non-NULL schema"); + return EINVAL; + } + + if (schema->release == NULL) { + ArrowErrorSet(error, "Expected non-released schema"); + return EINVAL; + } + + schema_view->schema = schema; + + const char* format = schema->format; + if (format == NULL) { + ArrowErrorSet( + error, + "Error parsing schema->format: Expected a null-terminated string but found NULL"); + return EINVAL; + } + + size_t format_len = strlen(format); + if (format_len == 0) { + ArrowErrorSet(error, "Error parsing schema->format: Expected a string with size > 0"); + return EINVAL; + } + + const char* format_end_out; + int result = ArrowSchemaViewParse(schema_view, format, &format_end_out, error); + + if (result != NANOARROW_OK) { + if (error != NULL) { + char child_error[1024]; + memcpy(child_error, ArrowErrorMessage(error), 1024); + ArrowErrorSet(error, "Error parsing schema->format: %s", child_error); + } + + return result; + } + + if ((format + format_len) != format_end_out) { + ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%d characters", + format, (int)(format_end_out - format), (int)(format_len)); + return EINVAL; + } + + if (schema->dictionary != NULL) { + schema_view->type = NANOARROW_TYPE_DICTIONARY; + } + + NANOARROW_RETURN_NOT_OK( + ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error)); + + if (schema_view->storage_type != schema_view->type) { + NANOARROW_RETURN_NOT_OK( + ArrowSchemaViewValidate(schema_view, schema_view->type, error)); + } + + int64_t unknown_flags = schema->flags & ~NANOARROW_FLAG_ALL_SUPPORTED; + if (unknown_flags != 0) { + ArrowErrorSet(error, "Unknown ArrowSchema flag"); + return EINVAL; + } + + if (schema->flags & ARROW_FLAG_DICTIONARY_ORDERED && + schema_view->type != NANOARROW_TYPE_DICTIONARY) { + ArrowErrorSet(error, + "ARROW_FLAG_DICTIONARY_ORDERED is only relevant for dictionaries"); + return EINVAL; + } + + if (schema->flags & ARROW_FLAG_MAP_KEYS_SORTED && + schema_view->type != NANOARROW_TYPE_MAP) { + ArrowErrorSet(error, "ARROW_FLAG_MAP_KEYS_SORTED is only relevant for a map type"); + return EINVAL; + } + + ArrowLayoutInit(&schema_view->layout, schema_view->storage_type); + if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_BINARY) { + schema_view->layout.element_size_bits[1] = schema_view->fixed_size * 8; + } else if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_LIST) { + schema_view->layout.child_size_elements = schema_view->fixed_size; + } + + schema_view->extension_name = ArrowCharView(NULL); + schema_view->extension_metadata = ArrowCharView(NULL); + NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, + ArrowCharView("ARROW:extension:name"), + &schema_view->extension_name)); + NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, + ArrowCharView("ARROW:extension:metadata"), + &schema_view->extension_metadata)); + + return NANOARROW_OK; +} + +static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_view, + char* out, int64_t n) { + const char* type_string = ArrowTypeString(schema_view->type); + switch (schema_view->type) { + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + return snprintf(out, n, "%s(%d, %d)", type_string, + (int)schema_view->decimal_precision, + (int)schema_view->decimal_scale); + case NANOARROW_TYPE_TIMESTAMP: + return snprintf(out, n, "%s('%s', '%s')", type_string, + ArrowTimeUnitString(schema_view->time_unit), schema_view->timezone); + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + return snprintf(out, n, "%s('%s')", type_string, + ArrowTimeUnitString(schema_view->time_unit)); + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return snprintf(out, n, "%s(%ld)", type_string, (long)schema_view->fixed_size); + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + return snprintf(out, n, "%s([%s])", type_string, schema_view->union_type_ids); + default: + return snprintf(out, n, "%s", type_string); + } +} + +// Helper for bookkeeping to emulate sprintf()-like behaviour spread +// among multiple sprintf calls. +static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, + int64_t* n_remaining, int64_t* n_chars) { + *n_chars += n_chars_last; + *n_remaining -= n_chars_last; + + // n_remaining is never less than 0 + if (*n_remaining < 0) { + *n_remaining = 0; + } + + // Can't do math on a NULL pointer + if (*out != NULL) { + *out += n_chars_last; + } +} + +int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, + char recursive) { + if (schema == NULL) { + return snprintf(out, n, "[invalid: pointer is null]"); + } + + if (schema->release == NULL) { + return snprintf(out, n, "[invalid: schema is released]"); + } + + struct ArrowSchemaView schema_view; + struct ArrowError error; + + if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) { + return snprintf(out, n, "[invalid: %s]", ArrowErrorMessage(&error)); + } + + // Extension type and dictionary should include both the top-level type + // and the storage type. + int is_extension = schema_view.extension_name.size_bytes > 0; + int is_dictionary = schema->dictionary != NULL; + int64_t n_chars = 0; + int64_t n_chars_last = 0; + + // Uncommon but not technically impossible that both are true + if (is_extension && is_dictionary) { + n_chars_last = snprintf( + out, n, "%.*s{dictionary(%s)<", (int)schema_view.extension_name.size_bytes, + schema_view.extension_name.data, ArrowTypeString(schema_view.storage_type)); + } else if (is_extension) { + n_chars_last = snprintf(out, n, "%.*s{", (int)schema_view.extension_name.size_bytes, + schema_view.extension_name.data); + } else if (is_dictionary) { + n_chars_last = + snprintf(out, n, "dictionary(%s)<", ArrowTypeString(schema_view.storage_type)); + } + + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + if (!is_dictionary) { + n_chars_last = ArrowSchemaTypeToStringInternal(&schema_view, out, n); + } else { + n_chars_last = ArrowSchemaToString(schema->dictionary, out, n, recursive); + } + + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + if (recursive && schema->format[0] == '+') { + n_chars_last = snprintf(out, n, "<"); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + for (int64_t i = 0; i < schema->n_children; i++) { + if (i > 0) { + n_chars_last = snprintf(out, n, ", "); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + // ArrowSchemaToStringInternal() will validate the child and print the error, + // but we need the name first + if (schema->children[i] != NULL && schema->children[i]->release != NULL && + schema->children[i]->name != NULL) { + n_chars_last = snprintf(out, n, "%s: ", schema->children[i]->name); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + n_chars_last = ArrowSchemaToString(schema->children[i], out, n, recursive); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + n_chars_last = snprintf(out, n, ">"); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + if (is_extension && is_dictionary) { + n_chars += snprintf(out, n, ">}"); + } else if (is_extension) { + n_chars += snprintf(out, n, "}"); + } else if (is_dictionary) { + n_chars += snprintf(out, n, ">"); + } + + return n_chars; +} + +ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, + const char* metadata) { + reader->metadata = metadata; + + if (reader->metadata == NULL) { + reader->offset = 0; + reader->remaining_keys = 0; + } else { + memcpy(&reader->remaining_keys, reader->metadata, sizeof(int32_t)); + reader->offset = sizeof(int32_t); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, + struct ArrowStringView* key_out, + struct ArrowStringView* value_out) { + if (reader->remaining_keys <= 0) { + return EINVAL; + } + + int64_t pos = 0; + + int32_t key_size; + memcpy(&key_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); + pos += sizeof(int32_t); + + key_out->data = reader->metadata + reader->offset + pos; + key_out->size_bytes = key_size; + pos += key_size; + + int32_t value_size; + memcpy(&value_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); + pos += sizeof(int32_t); + + value_out->data = reader->metadata + reader->offset + pos; + value_out->size_bytes = value_size; + pos += value_size; + + reader->offset += pos; + reader->remaining_keys--; + return NANOARROW_OK; +} + +int64_t ArrowMetadataSizeOf(const char* metadata) { + if (metadata == NULL) { + return 0; + } + + struct ArrowMetadataReader reader; + struct ArrowStringView key; + struct ArrowStringView value; + if (ArrowMetadataReaderInit(&reader, metadata) != NANOARROW_OK) { + return 0; + } + + int64_t size = sizeof(int32_t); + while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) { + size += sizeof(int32_t) + key.size_bytes + sizeof(int32_t) + value.size_bytes; + } + + return size; +} + +static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata, + struct ArrowStringView* key, + struct ArrowStringView* value_out) { + struct ArrowMetadataReader reader; + struct ArrowStringView existing_key; + struct ArrowStringView existing_value; + NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); + + while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) == + NANOARROW_OK) { + int key_equal = key->size_bytes == existing_key.size_bytes && + strncmp(key->data, existing_key.data, existing_key.size_bytes) == 0; + if (key_equal) { + value_out->data = existing_value.data; + value_out->size_bytes = existing_value.size_bytes; + break; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, + struct ArrowStringView* value_out) { + if (value_out == NULL) { + return EINVAL; + } + + return ArrowMetadataGetValueInternal(metadata, &key, value_out); +} + +char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) { + struct ArrowStringView value = ArrowCharView(NULL); + if (ArrowMetadataGetValue(metadata, key, &value) != NANOARROW_OK) { + return 0; + } + + return value.data != NULL; +} + +ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, + const char* metadata) { + ArrowBufferInit(buffer); + return ArrowBufferAppend(buffer, metadata, ArrowMetadataSizeOf(metadata)); +} + +static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer* buffer, + struct ArrowStringView* key, + struct ArrowStringView* value) { + if (value == NULL) { + return NANOARROW_OK; + } + + if (buffer->capacity_bytes == 0) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(buffer, 0)); + } + + if (((size_t)buffer->capacity_bytes) < sizeof(int32_t)) { + return EINVAL; + } + + int32_t n_keys; + memcpy(&n_keys, buffer->data, sizeof(int32_t)); + + int32_t key_size = (int32_t)key->size_bytes; + int32_t value_size = (int32_t)value->size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve( + buffer, sizeof(int32_t) + key_size + sizeof(int32_t) + value_size)); + + ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t)); + ArrowBufferAppendUnsafe(buffer, key->data, key_size); + ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t)); + ArrowBufferAppendUnsafe(buffer, value->data, value_size); + + n_keys++; + memcpy(buffer->data, &n_keys, sizeof(int32_t)); + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer* buffer, + struct ArrowStringView* key, + struct ArrowStringView* value) { + // Inspect the current value to see if we can avoid copying the buffer + struct ArrowStringView current_value = ArrowCharView(NULL); + NANOARROW_RETURN_NOT_OK( + ArrowMetadataGetValueInternal((const char*)buffer->data, key, ¤t_value)); + + // The key should be removed but no key exists + if (value == NULL && current_value.data == NULL) { + return NANOARROW_OK; + } + + // The key/value can be appended because no key exists + if (value != NULL && current_value.data == NULL) { + return ArrowMetadataBuilderAppendInternal(buffer, key, value); + } + + struct ArrowMetadataReader reader; + struct ArrowStringView existing_key; + struct ArrowStringView existing_value; + NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, (const char*)buffer->data)); + + struct ArrowBuffer new_buffer; + NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&new_buffer, NULL)); + + while (reader.remaining_keys > 0) { + int result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value); + if (result != NANOARROW_OK) { + ArrowBufferReset(&new_buffer); + return result; + } + + if (key->size_bytes == existing_key.size_bytes && + strncmp((const char*)key->data, (const char*)existing_key.data, + existing_key.size_bytes) == 0) { + result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value); + value = NULL; + } else { + result = + ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key, &existing_value); + } + + if (result != NANOARROW_OK) { + ArrowBufferReset(&new_buffer); + return result; + } + } + + ArrowBufferReset(buffer); + ArrowBufferMove(&new_buffer, buffer); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value) { + return ArrowMetadataBuilderAppendInternal(buffer, &key, &value); +} + +ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value) { + return ArrowMetadataBuilderSetInternal(buffer, &key, &value); +} + +ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, + struct ArrowStringView key) { + return ArrowMetadataBuilderSetInternal(buffer, &key, NULL); +} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "nanoarrow.h" + +static void ArrowArrayReleaseInternal(struct ArrowArray* array) { + // Release buffers held by this array + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + if (private_data != NULL) { + ArrowBitmapReset(&private_data->bitmap); + ArrowBufferReset(&private_data->buffers[0]); + ArrowBufferReset(&private_data->buffers[1]); + ArrowFree(private_data); + } + + // This object owns the memory for all the children, but those + // children may have been generated elsewhere and might have + // their own release() callback. + if (array->children != NULL) { + for (int64_t i = 0; i < array->n_children; i++) { + if (array->children[i] != NULL) { + if (array->children[i]->release != NULL) { + ArrowArrayRelease(array->children[i]); + } + + ArrowFree(array->children[i]); + } + } + + ArrowFree(array->children); + } + + // This object owns the memory for the dictionary but it + // may have been generated somewhere else and have its own + // release() callback. + if (array->dictionary != NULL) { + if (array->dictionary->release != NULL) { + ArrowArrayRelease(array->dictionary); + } + + ArrowFree(array->dictionary); + } + + // Mark released + array->release = NULL; +} + +static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, + enum ArrowType storage_type) { + switch (storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + case NANOARROW_TYPE_NA: + array->n_buffers = 0; + break; + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_SPARSE_UNION: + array->n_buffers = 1; + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_MAP: + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_HALF_FLOAT: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + case NANOARROW_TYPE_DENSE_UNION: + array->n_buffers = 2; + break; + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + array->n_buffers = 3; + break; + + default: + return EINVAL; + + return NANOARROW_OK; + } + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + private_data->storage_type = storage_type; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, + enum ArrowType storage_type) { + array->length = 0; + array->null_count = 0; + array->offset = 0; + array->n_buffers = 0; + array->n_children = 0; + array->buffers = NULL; + array->children = NULL; + array->dictionary = NULL; + array->release = &ArrowArrayReleaseInternal; + array->private_data = NULL; + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)ArrowMalloc(sizeof(struct ArrowArrayPrivateData)); + if (private_data == NULL) { + array->release = NULL; + return ENOMEM; + } + + ArrowBitmapInit(&private_data->bitmap); + ArrowBufferInit(&private_data->buffers[0]); + ArrowBufferInit(&private_data->buffers[1]); + private_data->buffer_data[0] = NULL; + private_data->buffer_data[1] = NULL; + private_data->buffer_data[2] = NULL; + + array->private_data = private_data; + array->buffers = (const void**)(&private_data->buffer_data); + + int result = ArrowArraySetStorageType(array, storage_type); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + + ArrowLayoutInit(&private_data->layout, storage_type); + // We can only know this not to be true when initializing based on a schema + // so assume this to be true. + private_data->union_type_id_is_child_index = 1; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + const struct ArrowArrayView* array_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayInitFromType(array, array_view->storage_type), error); + int result; + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + private_data->layout = array_view->layout; + + if (array_view->n_children > 0) { + result = ArrowArrayAllocateChildren(array, array_view->n_children); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + + for (int64_t i = 0; i < array_view->n_children; i++) { + result = + ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + } + } + + if (array_view->dictionary != NULL) { + result = ArrowArrayAllocateDictionary(array); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + + result = + ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, + const struct ArrowSchema* schema, + struct ArrowError* error) { + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error)); + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(array, &array_view, error)); + if (array_view.storage_type == NANOARROW_TYPE_DENSE_UNION || + array_view.storage_type == NANOARROW_TYPE_SPARSE_UNION) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + // We can still build arrays if this isn't true; however, the append + // functions won't work. Instead, we store this value and error only + // when StartAppending is called. + private_data->union_type_id_is_child_index = + _ArrowUnionTypeIdsWillEqualChildIndices(schema->format + 4, schema->n_children); + } + + ArrowArrayViewReset(&array_view); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children) { + if (array->children != NULL) { + return EINVAL; + } + + if (n_children == 0) { + return NANOARROW_OK; + } + + array->children = + (struct ArrowArray**)ArrowMalloc(n_children * sizeof(struct ArrowArray*)); + if (array->children == NULL) { + return ENOMEM; + } + + memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); + + for (int64_t i = 0; i < n_children; i++) { + array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); + if (array->children[i] == NULL) { + return ENOMEM; + } + array->children[i]->release = NULL; + } + + array->n_children = n_children; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array) { + if (array->dictionary != NULL) { + return EINVAL; + } + + array->dictionary = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); + if (array->dictionary == NULL) { + return ENOMEM; + } + + array->dictionary->release = NULL; + return NANOARROW_OK; +} + +void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + ArrowBufferMove(&bitmap->buffer, &private_data->bitmap.buffer); + private_data->bitmap.size_bits = bitmap->size_bits; + bitmap->size_bits = 0; + private_data->buffer_data[0] = private_data->bitmap.buffer.data; + array->null_count = -1; +} + +ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, + struct ArrowBuffer* buffer) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + switch (i) { + case 0: + ArrowBufferMove(buffer, &private_data->bitmap.buffer); + private_data->buffer_data[i] = private_data->bitmap.buffer.data; + break; + case 1: + case 2: + ArrowBufferMove(buffer, &private_data->buffers[i - 1]); + private_data->buffer_data[i] = private_data->buffers[i - 1].data; + break; + default: + return EINVAL; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_view, + struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + ArrowArrayViewInitFromType(array_view, private_data->storage_type); + array_view->layout = private_data->layout; + array_view->array = array; + array_view->length = array->length; + array_view->offset = array->offset; + array_view->null_count = array->null_count; + + array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; + array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; + array_view->buffer_views[1].data.as_uint8 = private_data->buffers[0].data; + array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; + array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; + array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; + + int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + for (int64_t i = 0; i < array->n_children; i++) { + result = ArrowArrayViewInitFromArray(array_view->children[i], array->children[i]); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (array->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array, + struct ArrowArrayView* array_view) { + // Loop through buffers and reserve the extra space that we know about + for (int64_t i = 0; i < array->n_buffers; i++) { + // Don't reserve on a validity buffer that hasn't been allocated yet + if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && + ArrowArrayBuffer(array, i)->data == NULL) { + continue; + } + + int64_t additional_size_bytes = + array_view->buffer_views[i].size_bytes - ArrowArrayBuffer(array, i)->size_bytes; + + if (additional_size_bytes > 0) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferReserve(ArrowArrayBuffer(array, i), additional_size_bytes)); + } + } + + // Recursively reserve children + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayReserveInternal(array->children[i], array_view->children[i])); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, + int64_t additional_size_elements) { + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array)); + + // Calculate theoretical buffer sizes (recursively) + ArrowArrayViewSetLength(&array_view, array->length + additional_size_elements); + + // Walk the structure (recursively) + int result = ArrowArrayReserveInternal(array, &array_view); + ArrowArrayViewReset(&array_view); + if (result != NANOARROW_OK) { + return result; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + // The only buffer finalizing this currently does is make sure the data + // buffer for (Large)String|Binary is never NULL + switch (private_data->storage_type) { + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_LARGE_STRING: + if (ArrowArrayBuffer(array, 2)->data == NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0)); + } + break; + default: + break; + } + + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); + } + + return NANOARROW_OK; +} + +static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; + } + + for (int64_t i = 0; i < array->n_children; i++) { + ArrowArrayFlushInternalPointers(array->children[i]); + } + + if (array->dictionary != NULL) { + ArrowArrayFlushInternalPointers(array->dictionary); + } +} + +ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + // Even if the data buffer is size zero, the pointer value needed to be non-null + // in some implementations (at least one version of Arrow C++ at the time this + // was added). Only do this fix if we can assume CPU data access. + if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); + } + + // Make sure the value we get with array->buffers[i] is set to the actual + // pointer (which may have changed from the original due to reallocation) + ArrowArrayFlushInternalPointers(array); + + if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) { + return NANOARROW_OK; + } + + // For validation, initialize an ArrowArrayView with our known buffer sizes + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array), + error); + int result = ArrowArrayViewValidate(&array_view, validation_level, error); + ArrowArrayViewReset(&array_view); + return result; +} + +ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, + struct ArrowError* error) { + return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error); +} + +void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, + enum ArrowType storage_type) { + memset(array_view, 0, sizeof(struct ArrowArrayView)); + array_view->storage_type = storage_type; + ArrowLayoutInit(&array_view->layout, storage_type); +} + +ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, + int64_t n_children) { + if (array_view->children != NULL) { + return EINVAL; + } + + array_view->children = + (struct ArrowArrayView**)ArrowMalloc(n_children * sizeof(struct ArrowArrayView*)); + if (array_view->children == NULL) { + return ENOMEM; + } + + for (int64_t i = 0; i < n_children; i++) { + array_view->children[i] = NULL; + } + + array_view->n_children = n_children; + + for (int64_t i = 0; i < n_children; i++) { + array_view->children[i] = + (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); + if (array_view->children[i] == NULL) { + return ENOMEM; + } + ArrowArrayViewInitFromType(array_view->children[i], NANOARROW_TYPE_UNINITIALIZED); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) { + if (array_view->dictionary != NULL) { + return EINVAL; + } + + array_view->dictionary = + (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); + if (array_view->dictionary == NULL) { + return ENOMEM; + } + + ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, + const struct ArrowSchema* schema, + struct ArrowError* error) { + struct ArrowSchemaView schema_view; + int result = ArrowSchemaViewInit(&schema_view, schema, error); + if (result != NANOARROW_OK) { + return result; + } + + ArrowArrayViewInitFromType(array_view, schema_view.storage_type); + array_view->layout = schema_view.layout; + + result = ArrowArrayViewAllocateChildren(array_view, schema->n_children); + if (result != NANOARROW_OK) { + ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed"); + ArrowArrayViewReset(array_view); + return result; + } + + for (int64_t i = 0; i < schema->n_children; i++) { + result = + ArrowArrayViewInitFromSchema(array_view->children[i], schema->children[i], error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (schema->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = + ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION || + array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) { + array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t)); + if (array_view->union_type_id_map == NULL) { + return ENOMEM; + } + + memset(array_view->union_type_id_map, -1, 256); + int32_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, + array_view->union_type_id_map + 128); + for (int8_t child_index = 0; child_index < n_type_ids; child_index++) { + int8_t type_id = array_view->union_type_id_map[128 + child_index]; + array_view->union_type_id_map[type_id] = child_index; + } + } + + return NANOARROW_OK; +} + +void ArrowArrayViewReset(struct ArrowArrayView* array_view) { + if (array_view->children != NULL) { + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i] != NULL) { + ArrowArrayViewReset(array_view->children[i]); + ArrowFree(array_view->children[i]); + } + } + + ArrowFree(array_view->children); + } + + if (array_view->dictionary != NULL) { + ArrowArrayViewReset(array_view->dictionary); + ArrowFree(array_view->dictionary); + } + + if (array_view->union_type_id_map != NULL) { + ArrowFree(array_view->union_type_id_map); + } + + ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED); +} + +void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; + + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: + array_view->buffer_views[i].size_bytes = _ArrowBytesForBits(length); + continue; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Probably don't want/need to rely on the producer to have allocated an + // offsets buffer of length 1 for a zero-size array + array_view->buffer_views[i].size_bytes = + (length != 0) * element_size_bytes * (length + 1); + continue; + case NANOARROW_BUFFER_TYPE_DATA: + array_view->buffer_views[i].size_bytes = + _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) / + 8; + continue; + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + array_view->buffer_views[i].size_bytes = element_size_bytes * length; + continue; + case NANOARROW_BUFFER_TYPE_NONE: + array_view->buffer_views[i].size_bytes = 0; + continue; + } + } + + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_SPARSE_UNION: + for (int64_t i = 0; i < array_view->n_children; i++) { + ArrowArrayViewSetLength(array_view->children[i], length); + } + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + if (array_view->n_children >= 1) { + ArrowArrayViewSetLength(array_view->children[0], + length * array_view->layout.child_size_elements); + } + default: + break; + } +} + +// This version recursively extracts information from the array and stores it +// in the array view, performing any checks that require the original array. +static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + array_view->array = array; + array_view->offset = array->offset; + array_view->length = array->length; + array_view->null_count = array->null_count; + + int64_t buffers_required = 0; + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { + break; + } + + buffers_required++; + + // Set buffer pointer + array_view->buffer_views[i].data.data = array->buffers[i]; + + // If non-null, set buffer size to unknown. + if (array->buffers[i] == NULL) { + array_view->buffer_views[i].size_bytes = 0; + } else { + array_view->buffer_views[i].size_bytes = -1; + } + } + + // Check the number of buffers + if (buffers_required != array->n_buffers) { + ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)", + (int)buffers_required, (int)array->n_buffers); + return EINVAL; + } + + // Check number of children + if (array_view->n_children != array->n_children) { + ArrowErrorSet(error, "Expected %ld children but found %ld children", + (long)array_view->n_children, (long)array->n_children); + return EINVAL; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i], + array->children[i], error)); + } + + // Check dictionary + if (array->dictionary == NULL && array_view->dictionary != NULL) { + ArrowErrorSet(error, "Expected dictionary but found NULL"); + return EINVAL; + } + + if (array->dictionary != NULL && array_view->dictionary == NULL) { + ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member"); + return EINVAL; + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, + struct ArrowError* error) { + if (array_view->length < 0) { + ArrowErrorSet(error, "Expected length >= 0 but found length %ld", + (long)array_view->length); + return EINVAL; + } + + if (array_view->offset < 0) { + ArrowErrorSet(error, "Expected offset >= 0 but found offset %ld", + (long)array_view->offset); + return EINVAL; + } + + // Calculate buffer sizes that do not require buffer access. If marked as + // unknown, assign the buffer size; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + + // Only loop over the first two buffers because the size of the third buffer + // is always data dependent for all current Arrow types. + for (int i = 0; i < 2; i++) { + int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; + // Initialize with a value that will cause an error if accidentally used uninitialized + int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1; + + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: + if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) { + continue; + } + + min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length); + break; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Probably don't want/need to rely on the producer to have allocated an + // offsets buffer of length 1 for a zero-size array + min_buffer_size_bytes = + (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1); + break; + case NANOARROW_BUFFER_TYPE_DATA: + min_buffer_size_bytes = + _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * + offset_plus_length) / + 8; + break; + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + min_buffer_size_bytes = element_size_bytes * offset_plus_length; + break; + case NANOARROW_BUFFER_TYPE_NONE: + continue; + } + + // Assign or validate buffer size + if (array_view->buffer_views[i].size_bytes == -1) { + array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; + } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { + ArrowErrorSet(error, + "Expected %s array buffer %d to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (int)i, + (long)min_buffer_size_bytes, + (long)array_view->buffer_views[i].size_bytes); + return EINVAL; + } + } + + // For list, fixed-size list and map views, we can validate the number of children + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_MAP: + if (array_view->n_children != 1) { + ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays", + ArrowTypeString(array_view->storage_type), + (long)array_view->n_children); + return EINVAL; + } + default: + break; + } + + // For struct, the sparse union, and the fixed-size list views, we can validate child + // lengths. + int64_t child_min_length; + switch (array_view->storage_type) { + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_STRUCT: + child_min_length = (array_view->offset + array_view->length); + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i]->length < child_min_length) { + ArrowErrorSet( + error, + "Expected struct child %d to have length >= %ld but found child with " + "length %ld", + (int)(i + 1), (long)(child_min_length), + (long)array_view->children[i]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_min_length = (array_view->offset + array_view->length) * + array_view->layout.child_size_elements; + if (array_view->children[0]->length < child_min_length) { + ArrowErrorSet(error, + "Expected child of fixed_size_list array to have length >= %ld but " + "found array with length %ld", + (long)child_min_length, (long)array_view->children[0]->length); + return EINVAL; + } + break; + default: + break; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewValidateMinimal(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, + struct ArrowError* error) { + // Perform minimal validation. This will validate or assign + // buffer sizes as long as buffer access is not required. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + // Calculate buffer sizes or child lengths that require accessing the offsets + // buffer. Where appropriate, validate that the first offset is >= 0. + // If a buffer size is marked as unknown, assign it; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + + int64_t first_offset; + int64_t last_offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int32[0]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", + (long)first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } + } else if (array_view->buffer_views[2].size_bytes == -1) { + // If the data buffer size is unknown and there are no bytes in the offset buffer, + // set the data buffer size to 0. + array_view->buffer_views[2].size_bytes = 0; + } + break; + + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int64[0]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", + (long)first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } + } else if (array_view->buffer_views[2].size_bytes == -1) { + // If the data buffer size is unknown and there are no bytes in the offset + // buffer, set the data buffer size to 0. + array_view->buffer_views[2].size_bytes = 0; + } + break; + + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i]->length < offset_plus_length) { + ArrowErrorSet( + error, + "Expected struct child %d to have length >= %ld but found child with " + "length %ld", + (int)(i + 1), (long)offset_plus_length, + (long)array_view->children[i]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int32[0]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", + (long)first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { + ArrowErrorSet( + error, + "Expected child of %s array to have length >= %ld but found array with " + "length %ld", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->children[0]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_LARGE_LIST: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int64[0]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", + (long)first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { + ArrowErrorSet( + error, + "Expected child of large list array to have length >= %ld but found array " + "with length %ld", + (long)last_offset, (long)array_view->children[0]->length); + return EINVAL; + } + } + break; + default: + break; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewValidateDefault(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + return NANOARROW_OK; +} + +static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, + struct ArrowError* error) { + if (view.size_bytes <= (int64_t)sizeof(int32_t)) { + return NANOARROW_OK; + } + + for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { + if (view.data.as_int32[i] < view.data.as_int32[i - 1]) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, + struct ArrowError* error) { + if (view.size_bytes <= (int64_t)sizeof(int64_t)) { + return NANOARROW_OK; + } + + for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { + if (view.data.as_int64[i] < view.data.as_int64[i - 1]) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value, + int8_t max_value, struct ArrowError* error) { + for (int64_t i = 0; i < view.size_bytes; i++) { + if (view.data.as_int8[i] < min_value || view.data.as_int8[i] > max_value) { + ArrowErrorSet(error, + "[%ld] Expected buffer value between %d and %d but found value %d", + (long)i, (int)min_value, (int)max_value, (int)view.data.as_int8[i]); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, + int64_t n_values, struct ArrowError* error) { + for (int64_t i = 0; i < view.size_bytes; i++) { + int item_found = 0; + for (int64_t j = 0; j < n_values; j++) { + if (view.data.as_int8[i] == values[j]) { + item_found = 1; + break; + } + } + + if (!item_found) { + ArrowErrorSet(error, "[%ld] Unexpected buffer value %d", (long)i, + (int)view.data.as_int8[i]); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, + struct ArrowError* error) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + if (array_view->layout.element_size_bits[i] == 32) { + NANOARROW_RETURN_NOT_OK( + ArrowAssertIncreasingInt32(array_view->buffer_views[i], error)); + } else { + NANOARROW_RETURN_NOT_OK( + ArrowAssertIncreasingInt64(array_view->buffer_views[i], error)); + } + break; + default: + break; + } + } + + if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || + array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { + if (array_view->union_type_id_map == NULL) { + // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + + // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough + // information to validate this buffer. + ArrowErrorSet(error, + "Insufficient information provided for validation of union array"); + return EINVAL; + } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices( + array_view->union_type_id_map, array_view->n_children, + array_view->n_children)) { + NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( + array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error)); + } else { + NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0], + array_view->union_type_id_map + 128, + array_view->n_children, error)); + } + } + + if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION && + array_view->union_type_id_map != NULL) { + // Check that offsets refer to child elements that actually exist + for (int64_t i = 0; i < array_view->length; i++) { + int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i); + int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); + int64_t child_length = array_view->children[child_id]->length; + if (offset < 0 || offset > child_length) { + ArrowErrorSet( + error, + "[%ld] Expected union offset for child id %d to be between 0 and %ld but " + "found offset value %ld", + (long)i, (int)child_id, (long)child_length, (long)offset); + return EINVAL; + } + } + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); + } + + // Dictionary valiation not implemented + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, error)); + // TODO: validate the indices + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + switch (validation_level) { + case NANOARROW_VALIDATION_LEVEL_NONE: + return NANOARROW_OK; + case NANOARROW_VALIDATION_LEVEL_MINIMAL: + return ArrowArrayViewValidateMinimal(array_view, error); + case NANOARROW_VALIDATION_LEVEL_DEFAULT: + return ArrowArrayViewValidateDefault(array_view, error); + case NANOARROW_VALIDATION_LEVEL_FULL: + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + return ArrowArrayViewValidateFull(array_view, error); + } + + ArrowErrorSet(error, "validation_level not recognized"); + return EINVAL; +} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "nanoarrow.h" + +struct BasicArrayStreamPrivate { + struct ArrowSchema schema; + int64_t n_arrays; + struct ArrowArray* arrays; + int64_t arrays_i; +}; + +static int ArrowBasicArrayStreamGetSchema(struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema) { + if (array_stream == NULL || array_stream->release == NULL) { + return EINVAL; + } + + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + return ArrowSchemaDeepCopy(&private_data->schema, schema); +} + +static int ArrowBasicArrayStreamGetNext(struct ArrowArrayStream* array_stream, + struct ArrowArray* array) { + if (array_stream == NULL || array_stream->release == NULL) { + return EINVAL; + } + + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + + if (private_data->arrays_i == private_data->n_arrays) { + array->release = NULL; + return NANOARROW_OK; + } + + ArrowArrayMove(&private_data->arrays[private_data->arrays_i++], array); + return NANOARROW_OK; +} + +static const char* ArrowBasicArrayStreamGetLastError( + struct ArrowArrayStream* array_stream) { + NANOARROW_UNUSED(array_stream); + return NULL; +} + +static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) { + if (array_stream == NULL || array_stream->release == NULL) { + return; + } + + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + + if (private_data->schema.release != NULL) { + ArrowSchemaRelease(&private_data->schema); + } + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + if (private_data->arrays[i].release != NULL) { + ArrowArrayRelease(&private_data->arrays[i]); + } + } + + if (private_data->arrays != NULL) { + ArrowFree(private_data->arrays); + } + + ArrowFree(private_data); + array_stream->release = NULL; +} + +ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema, int64_t n_arrays) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)ArrowMalloc( + sizeof(struct BasicArrayStreamPrivate)); + if (private_data == NULL) { + return ENOMEM; + } + + ArrowSchemaMove(schema, &private_data->schema); + + private_data->n_arrays = n_arrays; + private_data->arrays = NULL; + private_data->arrays_i = 0; + + if (n_arrays > 0) { + private_data->arrays = + (struct ArrowArray*)ArrowMalloc(n_arrays * sizeof(struct ArrowArray)); + if (private_data->arrays == NULL) { + ArrowBasicArrayStreamRelease(array_stream); + return ENOMEM; + } + } + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + private_data->arrays[i].release = NULL; + } + + array_stream->get_schema = &ArrowBasicArrayStreamGetSchema; + array_stream->get_next = &ArrowBasicArrayStreamGetNext; + array_stream->get_last_error = ArrowBasicArrayStreamGetLastError; + array_stream->release = ArrowBasicArrayStreamRelease; + array_stream->private_data = private_data; + return NANOARROW_OK; +} + +void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, + struct ArrowArray* array) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + ArrowArrayMove(array, &private_data->arrays[i]); +} + +ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, + struct ArrowError* error) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewInitFromSchema(&array_view, &private_data->schema, error)); + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + if (private_data->arrays[i].release != NULL) { + int result = ArrowArrayViewSetArray(&array_view, &private_data->arrays[i], error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(&array_view); + return result; + } + } + } + + ArrowArrayViewReset(&array_view); + return NANOARROW_OK; +} diff --git a/libtiledbsoma/src/utils/nanoarrow.h b/libtiledbsoma/src/utils/nanoarrow.h new file mode 100644 index 0000000000..8d62ac64fd --- /dev/null +++ b/libtiledbsoma/src/utils/nanoarrow.h @@ -0,0 +1,3734 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_BUILD_ID_H_INCLUDED +#define NANOARROW_BUILD_ID_H_INCLUDED + +#define NANOARROW_VERSION_MAJOR 0 +#define NANOARROW_VERSION_MINOR 5 +#define NANOARROW_VERSION_PATCH 0 +#define NANOARROW_VERSION "0.5.0-SNAPSHOT" + +#define NANOARROW_VERSION_INT \ + (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ + NANOARROW_VERSION_PATCH) + +// #define NANOARROW_NAMESPACE YourNamespaceHere + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_NANOARROW_TYPES_H_INCLUDED +#define NANOARROW_NANOARROW_TYPES_H_INCLUDED + +#include +#include + + + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Extra guard for versions of Arrow without the canonical guard +#ifndef ARROW_FLAG_DICTIONARY_ORDERED + +/// \defgroup nanoarrow-arrow-cdata Arrow C Data interface +/// +/// The Arrow C Data (https://arrow.apache.org/docs/format/CDataInterface.html) +/// and Arrow C Stream (https://arrow.apache.org/docs/format/CStreamInterface.html) +/// interfaces are part of the +/// Arrow Columnar Format specification +/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for +/// documentation of these structures. +/// +/// @{ + +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +#define ARROW_C_STREAM_INTERFACE + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_STREAM_INTERFACE +#endif // ARROW_FLAG_DICTIONARY_ORDERED + +/// @} + +// Utility macros +#define _NANOARROW_CONCAT(x, y) x##y +#define _NANOARROW_MAKE_NAME(x, y) _NANOARROW_CONCAT(x, y) + +#define _NANOARROW_RETURN_NOT_OK_IMPL(NAME, EXPR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) return NAME; \ + } while (0) + +#define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) + +#define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ <= max_) ? NANOARROW_OK : EINVAL) + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d\n* %s:%d", EXPR_STR, \ + NAME, __FILE__, __LINE__); \ + return NAME; \ + } \ + } while (0) +#else +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ + return NAME; \ + } \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +// For checking ArrowErrorSet() calls for valid printf format strings/arguments +// If using mingw's c99-compliant printf, we need a different format-checking attribute +#if defined(__USE_MINGW_ANSI_STDIO) && defined(__MINGW_PRINTF_FORMAT) +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE \ + __attribute__((format(__MINGW_PRINTF_FORMAT, 2, 3))) +#elif defined(__GNUC__) +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE __attribute__((format(printf, 2, 3))) +#else +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE +#endif + +// For checking calls to functions that return ArrowErrorCode +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define NANOARROW_CHECK_RETURN_ATTRIBUTE __attribute__((warn_unused_result)) +#elif defined(_MSC_VER) && (_MSC_VER >= 1700) +#define NANOARROW_CHECK_RETURN_ATTRIBUTE _Check_return_ +#else +#define NANOARROW_CHECK_RETURN_ATTRIBUTE +#endif + +#else +#define NANOARROW_CHECK_RETURN_ATTRIBUTE +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE +#endif + +#define NANOARROW_UNUSED(x) (void)(x) + +/// \brief Return code for success. +/// \ingroup nanoarrow-errors +#define NANOARROW_OK 0 + +/// \brief Represents an errno-compatible error code +/// \ingroup nanoarrow-errors +typedef int ArrowErrorCode; + +#if defined(NANOARROW_DEBUG) +#define ArrowErrorCode NANOARROW_CHECK_RETURN_ATTRIBUTE ArrowErrorCode +#endif + +/// \brief Flags supported by ArrowSchemaViewInit() +/// \ingroup nanoarrow-schema-view +#define NANOARROW_FLAG_ALL_SUPPORTED \ + (ARROW_FLAG_DICTIONARY_ORDERED | ARROW_FLAG_NULLABLE | ARROW_FLAG_MAP_KEYS_SORTED) + +/// \brief Error type containing a UTF-8 encoded message. +/// \ingroup nanoarrow-errors +struct ArrowError { + /// \brief A character buffer with space for an error message. + char message[1024]; +}; + +/// \brief Ensure an ArrowError is null-terminated by zeroing the first character. +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, this function does nothing. +static inline void ArrowErrorInit(struct ArrowError* error) { + if (error != NULL) { + error->message[0] = '\0'; + } +} + +/// \brief Get the contents of an error +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, returns "", or returns the contents of the error message +/// otherwise. +static inline const char* ArrowErrorMessage(struct ArrowError* error) { + if (error == NULL) { + return ""; + } else { + return error->message; + } +} + +/// \brief Set the contents of an error from an existing null-terminated string +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, this function does nothing. +static inline void ArrowErrorSetString(struct ArrowError* error, const char* src) { + if (error == NULL) { + return; + } + + int64_t src_len = strlen(src); + if (src_len >= ((int64_t)sizeof(error->message))) { + memcpy(error->message, src, sizeof(error->message) - 1); + error->message[sizeof(error->message) - 1] = '\0'; + } else { + memcpy(error->message, src, src_len); + error->message[src_len] = '\0'; + } +} + +/// \brief Check the result of an expression and return it if not NANOARROW_OK +/// \ingroup nanoarrow-errors +#define NANOARROW_RETURN_NOT_OK(EXPR) \ + _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) + +/// \brief Check the result of an expression and return it if not NANOARROW_OK, +/// adding an auto-generated message to an ArrowError. +/// \ingroup nanoarrow-errors +/// +/// This macro is used to ensure that functions that accept an ArrowError +/// as input always set its message when returning an error code (e.g., when calling +/// a nanoarrow function that does *not* accept ArrowError). +#define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ + _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, ERROR_EXPR, #EXPR) + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ + do { \ + fprintf(stderr, "%s failed with code %d\n* %s:%d\n", EXPR_STR, (int)(VALUE), \ + __FILE__, (int)__LINE__); \ + abort(); \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ + } while (0) + +/// \brief Assert that an expression's value is NANOARROW_OK +/// \ingroup nanoarrow-errors +/// +/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), +/// print a message to stderr and abort. If nanoarrow was built in release mode, +/// this statement has no effect. You can customize fatal error behaviour +/// be defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h +/// This macro is provided as a convenience for users and is not used internally. +#define NANOARROW_ASSERT_OK(EXPR) \ + _NANOARROW_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) + +#define _NANOARROW_DCHECK_IMPL(EXPR, EXPR_STR) \ + do { \ + if (!(EXPR)) NANOARROW_PRINT_AND_DIE(-1, EXPR_STR); \ + } while (0) + +#define NANOARROW_DCHECK(EXPR) _NANOARROW_DCHECK_IMPL(EXPR, #EXPR) +#else +#define NANOARROW_ASSERT_OK(EXPR) EXPR +#define NANOARROW_DCHECK(EXPR) +#endif + +static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowSchema)); + src->release = NULL; +} + +static inline void ArrowSchemaRelease(struct ArrowSchema* schema) { + NANOARROW_DCHECK(schema != NULL); + schema->release(schema); + NANOARROW_DCHECK(schema->release == NULL); +} + +static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowArray)); + src->release = NULL; +} + +static inline void ArrowArrayRelease(struct ArrowArray* array) { + NANOARROW_DCHECK(array != NULL); + array->release(array); + NANOARROW_DCHECK(array->release == NULL); +} + +static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowArrayStream)); + src->release = NULL; +} + +static inline const char* ArrowArrayStreamGetLastError( + struct ArrowArrayStream* array_stream) { + NANOARROW_DCHECK(array_stream != NULL); + + const char* value = array_stream->get_last_error(array_stream); + if (value == NULL) { + return ""; + } else { + return value; + } +} + +static inline ArrowErrorCode ArrowArrayStreamGetSchema( + struct ArrowArrayStream* array_stream, struct ArrowSchema* out, + struct ArrowError* error) { + NANOARROW_DCHECK(array_stream != NULL); + + int result = array_stream->get_schema(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } + + return result; +} + +static inline ArrowErrorCode ArrowArrayStreamGetNext( + struct ArrowArrayStream* array_stream, struct ArrowArray* out, + struct ArrowError* error) { + NANOARROW_DCHECK(array_stream != NULL); + + int result = array_stream->get_next(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } + + return result; +} + +static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream) { + NANOARROW_DCHECK(array_stream != NULL); + array_stream->release(array_stream); + NANOARROW_DCHECK(array_stream->release == NULL); +} + +static char _ArrowIsLittleEndian(void) { + uint32_t check = 1; + char first_byte; + memcpy(&first_byte, &check, sizeof(char)); + return first_byte; +} + +/// \brief Arrow type enumerator +/// \ingroup nanoarrow-utils +/// +/// These names are intended to map to the corresponding arrow::Type::type +/// enumerator; however, the numeric values are specifically not equal +/// (i.e., do not rely on numeric comparison). +enum ArrowType { + NANOARROW_TYPE_UNINITIALIZED = 0, + NANOARROW_TYPE_NA = 1, + NANOARROW_TYPE_BOOL, + NANOARROW_TYPE_UINT8, + NANOARROW_TYPE_INT8, + NANOARROW_TYPE_UINT16, + NANOARROW_TYPE_INT16, + NANOARROW_TYPE_UINT32, + NANOARROW_TYPE_INT32, + NANOARROW_TYPE_UINT64, + NANOARROW_TYPE_INT64, + NANOARROW_TYPE_HALF_FLOAT, + NANOARROW_TYPE_FLOAT, + NANOARROW_TYPE_DOUBLE, + NANOARROW_TYPE_STRING, + NANOARROW_TYPE_BINARY, + NANOARROW_TYPE_FIXED_SIZE_BINARY, + NANOARROW_TYPE_DATE32, + NANOARROW_TYPE_DATE64, + NANOARROW_TYPE_TIMESTAMP, + NANOARROW_TYPE_TIME32, + NANOARROW_TYPE_TIME64, + NANOARROW_TYPE_INTERVAL_MONTHS, + NANOARROW_TYPE_INTERVAL_DAY_TIME, + NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DECIMAL256, + NANOARROW_TYPE_LIST, + NANOARROW_TYPE_STRUCT, + NANOARROW_TYPE_SPARSE_UNION, + NANOARROW_TYPE_DENSE_UNION, + NANOARROW_TYPE_DICTIONARY, + NANOARROW_TYPE_MAP, + NANOARROW_TYPE_EXTENSION, + NANOARROW_TYPE_FIXED_SIZE_LIST, + NANOARROW_TYPE_DURATION, + NANOARROW_TYPE_LARGE_STRING, + NANOARROW_TYPE_LARGE_BINARY, + NANOARROW_TYPE_LARGE_LIST, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO +}; + +/// \brief Get a string value of an enum ArrowType value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for type +static inline const char* ArrowTypeString(enum ArrowType type); + +static inline const char* ArrowTypeString(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_NA: + return "na"; + case NANOARROW_TYPE_BOOL: + return "bool"; + case NANOARROW_TYPE_UINT8: + return "uint8"; + case NANOARROW_TYPE_INT8: + return "int8"; + case NANOARROW_TYPE_UINT16: + return "uint16"; + case NANOARROW_TYPE_INT16: + return "int16"; + case NANOARROW_TYPE_UINT32: + return "uint32"; + case NANOARROW_TYPE_INT32: + return "int32"; + case NANOARROW_TYPE_UINT64: + return "uint64"; + case NANOARROW_TYPE_INT64: + return "int64"; + case NANOARROW_TYPE_HALF_FLOAT: + return "half_float"; + case NANOARROW_TYPE_FLOAT: + return "float"; + case NANOARROW_TYPE_DOUBLE: + return "double"; + case NANOARROW_TYPE_STRING: + return "string"; + case NANOARROW_TYPE_BINARY: + return "binary"; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + return "fixed_size_binary"; + case NANOARROW_TYPE_DATE32: + return "date32"; + case NANOARROW_TYPE_DATE64: + return "date64"; + case NANOARROW_TYPE_TIMESTAMP: + return "timestamp"; + case NANOARROW_TYPE_TIME32: + return "time32"; + case NANOARROW_TYPE_TIME64: + return "time64"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "interval_months"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "interval_day_time"; + case NANOARROW_TYPE_DECIMAL128: + return "decimal128"; + case NANOARROW_TYPE_DECIMAL256: + return "decimal256"; + case NANOARROW_TYPE_LIST: + return "list"; + case NANOARROW_TYPE_STRUCT: + return "struct"; + case NANOARROW_TYPE_SPARSE_UNION: + return "sparse_union"; + case NANOARROW_TYPE_DENSE_UNION: + return "dense_union"; + case NANOARROW_TYPE_DICTIONARY: + return "dictionary"; + case NANOARROW_TYPE_MAP: + return "map"; + case NANOARROW_TYPE_EXTENSION: + return "extension"; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return "fixed_size_list"; + case NANOARROW_TYPE_DURATION: + return "duration"; + case NANOARROW_TYPE_LARGE_STRING: + return "large_string"; + case NANOARROW_TYPE_LARGE_BINARY: + return "large_binary"; + case NANOARROW_TYPE_LARGE_LIST: + return "large_list"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "interval_month_day_nano"; + default: + return NULL; + } +} + +/// \brief Arrow time unit enumerator +/// \ingroup nanoarrow-utils +/// +/// These names and values map to the corresponding arrow::TimeUnit::type +/// enumerator. +enum ArrowTimeUnit { + NANOARROW_TIME_UNIT_SECOND = 0, + NANOARROW_TIME_UNIT_MILLI = 1, + NANOARROW_TIME_UNIT_MICRO = 2, + NANOARROW_TIME_UNIT_NANO = 3 +}; + +/// \brief Validation level enumerator +/// \ingroup nanoarrow-array +enum ArrowValidationLevel { + /// \brief Do not validate buffer sizes or content. + NANOARROW_VALIDATION_LEVEL_NONE = 0, + + /// \brief Validate buffer sizes that depend on array length but do not validate buffer + /// sizes that depend on buffer data access. + NANOARROW_VALIDATION_LEVEL_MINIMAL = 1, + + /// \brief Validate all buffer sizes, including those that require buffer data access, + /// but do not perform any checks that are O(1) along the length of the buffers. + NANOARROW_VALIDATION_LEVEL_DEFAULT = 2, + + /// \brief Validate all buffer sizes and all buffer content. This is useful in the + /// context of untrusted input or input that may have been corrupted in transit. + NANOARROW_VALIDATION_LEVEL_FULL = 3 +}; + +/// \brief Get a string value of an enum ArrowTimeUnit value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for time_unit +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); + +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case NANOARROW_TIME_UNIT_MILLI: + return "ms"; + case NANOARROW_TIME_UNIT_MICRO: + return "us"; + case NANOARROW_TIME_UNIT_NANO: + return "ns"; + default: + return NULL; + } +} + +/// \brief Functional types of buffers as described in the Arrow Columnar Specification +/// \ingroup nanoarrow-array-view +enum ArrowBufferType { + NANOARROW_BUFFER_TYPE_NONE, + NANOARROW_BUFFER_TYPE_VALIDITY, + NANOARROW_BUFFER_TYPE_TYPE_ID, + NANOARROW_BUFFER_TYPE_UNION_OFFSET, + NANOARROW_BUFFER_TYPE_DATA_OFFSET, + NANOARROW_BUFFER_TYPE_DATA +}; + +/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout +/// \ingroup nanoarrow-array-view +/// +/// All currently supported types have 3 buffers or fewer; however, future types +/// may involve a variable number of buffers (e.g., string view). These buffers +/// will be represented by separate members of the ArrowArrayView or ArrowLayout. +#define NANOARROW_MAX_FIXED_BUFFERS 3 + +/// \brief An non-owning view of a string +/// \ingroup nanoarrow-utils +struct ArrowStringView { + /// \brief A pointer to the start of the string + /// + /// If size_bytes is 0, this value may be NULL. + const char* data; + + /// \brief The size of the string in bytes, + /// + /// (Not including the null terminator.) + int64_t size_bytes; +}; + +/// \brief Return a view of a const C string +/// \ingroup nanoarrow-utils +static inline struct ArrowStringView ArrowCharView(const char* value); + +static inline struct ArrowStringView ArrowCharView(const char* value) { + struct ArrowStringView out; + + out.data = value; + if (value) { + out.size_bytes = (int64_t)strlen(value); + } else { + out.size_bytes = 0; + } + + return out; +} + +union ArrowBufferViewData { + const void* data; + const int8_t* as_int8; + const uint8_t* as_uint8; + const int16_t* as_int16; + const uint16_t* as_uint16; + const int32_t* as_int32; + const uint32_t* as_uint32; + const int64_t* as_int64; + const uint64_t* as_uint64; + const double* as_double; + const float* as_float; + const char* as_char; +}; + +/// \brief An non-owning view of a buffer +/// \ingroup nanoarrow-utils +struct ArrowBufferView { + /// \brief A pointer to the start of the buffer + /// + /// If size_bytes is 0, this value may be NULL. + union ArrowBufferViewData data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; +}; + +/// \brief Array buffer allocation and deallocation +/// \ingroup nanoarrow-buffer +/// +/// Container for allocate, reallocate, and free methods that can be used +/// to customize allocation and deallocation of buffers when constructing +/// an ArrowArray. +struct ArrowBufferAllocator { + /// \brief Reallocate a buffer or return NULL if it cannot be reallocated + uint8_t* (*reallocate)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t old_size, int64_t new_size); + + /// \brief Deallocate a buffer allocated by this allocator + void (*free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); + + /// \brief Opaque data specific to the allocator + void* private_data; +}; + +typedef void (*ArrowBufferDeallocatorCallback)(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t size); + +/// \brief An owning mutable view of a buffer +/// \ingroup nanoarrow-buffer +struct ArrowBuffer { + /// \brief A pointer to the start of the buffer + /// + /// If capacity_bytes is 0, this value may be NULL. + uint8_t* data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; + + /// \brief The capacity of the buffer in bytes + int64_t capacity_bytes; + + /// \brief The allocator that will be used to reallocate and/or free the buffer + struct ArrowBufferAllocator allocator; +}; + +/// \brief An owning mutable view of a bitmap +/// \ingroup nanoarrow-bitmap +struct ArrowBitmap { + /// \brief An ArrowBuffer to hold the allocated memory + struct ArrowBuffer buffer; + + /// \brief The number of bits that have been appended to the bitmap + int64_t size_bits; +}; + +/// \brief A description of an arrangement of buffers +/// \ingroup nanoarrow-utils +/// +/// Contains the minimum amount of information required to +/// calculate the size of each buffer in an ArrowArray knowing only +/// the length and offset of the array. +struct ArrowLayout { + /// \brief The function of each buffer + enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The data type of each buffer + enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The size of an element each buffer or 0 if this size is variable or unknown + int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The number of elements in the child array per element in this array for a + /// fixed-size list + int64_t child_size_elements; +}; + +/// \brief A non-owning view of an ArrowArray +/// \ingroup nanoarrow-array-view +/// +/// This data structure provides access to the values contained within +/// an ArrowArray with fields provided in a more readily-extractible +/// form. You can re-use an ArrowArrayView for multiple ArrowArrays +/// with the same storage type, use it to represent a hypothetical +/// ArrowArray that does not exist yet, or use it to validate the buffers +/// of a future ArrowArray. +struct ArrowArrayView { + /// \brief The underlying ArrowArray or NULL if it has not been set or + /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. + const struct ArrowArray* array; + + /// \brief The number of elements from the physical start of the buffers. + int64_t offset; + + /// \brief The number of elements in this view. + int64_t length; + + /// \brief A cached null count or -1 to indicate that this value is unknown. + int64_t null_count; + + /// \brief The type used to store values in this array + /// + /// This type represents only the minimum required information to + /// extract values from the array buffers (e.g., for a Date32 array, + /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded + /// arrays, this will be the index type. + enum ArrowType storage_type; + + /// \brief The buffer types, strides, and sizes of this Array's buffers + struct ArrowLayout layout; + + /// \brief This Array's buffers as ArrowBufferView objects + struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The number of children of this view + int64_t n_children; + + /// \brief Pointers to views of this array's children + struct ArrowArrayView** children; + + /// \brief Pointer to a view of this array's dictionary + struct ArrowArrayView* dictionary; + + /// \brief Union type id to child index mapping + /// + /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer + /// such that child_index == union_type_id_map[type_id] and + /// type_id == union_type_id_map[128 + child_index]. This value may be + /// NULL in the case where child_id == type_id. + int8_t* union_type_id_map; +}; + +// Used as the private data member for ArrowArrays allocated here and accessed +// internally within inline ArrowArray* helpers. +struct ArrowArrayPrivateData { + // Holder for the validity buffer (or first buffer for union types, which are + // the only type whose first buffer is not a valdiity buffer) + struct ArrowBitmap bitmap; + + // Holder for additional buffers as required + struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1]; + + // The array of pointers to buffers. This must be updated after a sequence + // of appends to synchronize its values with the actual buffer addresses + // (which may have ben reallocated uring that time) + const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; + + // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown + enum ArrowType storage_type; + + // The buffer arrangement for the storage type + struct ArrowLayout layout; + + // Flag to indicate if there are non-sequence union type ids. + // In the future this could be replaced with a type id<->child mapping + // to support constructing unions in append mode where type_id != child_index + int8_t union_type_id_is_child_index; +}; + +/// \brief A representation of an interval. +/// \ingroup nanoarrow-utils +struct ArrowInterval { + /// \brief The type of interval being used + enum ArrowType type; + /// \brief The number of months represented by the interval + int32_t months; + /// \brief The number of days represented by the interval + int32_t days; + /// \brief The number of ms represented by the interval + int32_t ms; + /// \brief The number of ns represented by the interval + int64_t ns; +}; + +/// \brief Zero initialize an Interval with a given unit +/// \ingroup nanoarrow-utils +static inline void ArrowIntervalInit(struct ArrowInterval* interval, + enum ArrowType type) { + memset(interval, 0, sizeof(struct ArrowInterval)); + interval->type = type; +} + +/// \brief A representation of a fixed-precision decimal number +/// \ingroup nanoarrow-utils +/// +/// This structure should be initialized with ArrowDecimalInit() once and +/// values set using ArrowDecimalSetInt(), ArrowDecimalSetBytes128(), +/// or ArrowDecimalSetBytes256(). +struct ArrowDecimal { + /// \brief An array of 64-bit integers of n_words length defined in native-endian order + uint64_t words[4]; + + /// \brief The number of significant digits this decimal number can represent + int32_t precision; + + /// \brief The number of digits after the decimal point. This can be negative. + int32_t scale; + + /// \brief The number of words in the words array + int n_words; + + /// \brief Cached value used by the implementation + int high_word_index; + + /// \brief Cached value used by the implementation + int low_word_index; +}; + +/// \brief Initialize a decimal with a given set of type parameters +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwidth, + int32_t precision, int32_t scale) { + memset(decimal->words, 0, sizeof(decimal->words)); + decimal->precision = precision; + decimal->scale = scale; + decimal->n_words = bitwidth / 8 / sizeof(uint64_t); + + if (_ArrowIsLittleEndian()) { + decimal->low_word_index = 0; + decimal->high_word_index = decimal->n_words - 1; + } else { + decimal->low_word_index = decimal->n_words - 1; + decimal->high_word_index = 0; + } +} + +/// \brief Get a signed integer value of a sufficiently small ArrowDecimal +/// +/// This does not check if the decimal's precision sufficiently small to fit +/// within the signed 64-bit integer range (A precision less than or equal +/// to 18 is sufficiently small). +static inline int64_t ArrowDecimalGetIntUnsafe(const struct ArrowDecimal* decimal) { + return (int64_t)decimal->words[decimal->low_word_index]; +} + +/// \brief Copy the bytes of this decimal into a sufficiently large buffer +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalGetBytes(const struct ArrowDecimal* decimal, + uint8_t* out) { + memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); +} + +/// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise +/// \ingroup nanoarrow-utils +static inline int64_t ArrowDecimalSign(const struct ArrowDecimal* decimal) { + return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); +} + +/// \brief Sets the integer value of this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t value) { + if (value < 0) { + memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); + } else { + memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); + } + + decimal->words[decimal->low_word_index] = value; +} + +/// \brief Negate the value of this decimal in place +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalNegate(struct ArrowDecimal* decimal) { + uint64_t carry = 1; + + if (decimal->low_word_index == 0) { + for (int i = 0; i < decimal->n_words; i++) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } + } else { + for (int i = decimal->low_word_index; i >= 0; i--) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } + } +} + +/// \brief Copy bytes from a buffer into this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, + const uint8_t* value) { + memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_H_INCLUDED +#define NANOARROW_H_INCLUDED + +#include +#include +#include + + + +// If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will set this +// define in nanoarrow_config.h. If not, you can optionally #define NANOARROW_NAMESPACE +// MyNamespace here. + +// This section remaps the non-prefixed symbols to the prefixed symbols so that +// code written against this build can be used independent of the value of +// NANOARROW_NAMESPACE. +#ifdef NANOARROW_NAMESPACE +#define NANOARROW_CAT(A, B) A##B +#define NANOARROW_SYMBOL(A, B) NANOARROW_CAT(A, B) + +#define ArrowNanoarrowVersion NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) +#define ArrowNanoarrowVersionInt \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) +#define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc) +#define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc) +#define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree) +#define ArrowBufferAllocatorDefault \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault) +#define ArrowBufferDeallocator \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) +#define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet) +#define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit) +#define ArrowDecimalSetDigits NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalSetDigits) +#define ArrowDecimalAppendDigitsToBuffer \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalAppendDigitsToBuffer) +#define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit) +#define ArrowSchemaInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) +#define ArrowSchemaSetType NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType) +#define ArrowSchemaSetTypeStruct \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct) +#define ArrowSchemaSetTypeFixedSize \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) +#define ArrowSchemaSetTypeDecimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) +#define ArrowSchemaSetTypeDateTime \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) +#define ArrowSchemaSetTypeUnion \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion) +#define ArrowSchemaDeepCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy) +#define ArrowSchemaSetFormat NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat) +#define ArrowSchemaSetName NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName) +#define ArrowSchemaSetMetadata \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata) +#define ArrowSchemaAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren) +#define ArrowSchemaAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary) +#define ArrowMetadataReaderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit) +#define ArrowMetadataReaderRead \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead) +#define ArrowMetadataSizeOf NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf) +#define ArrowMetadataHasKey NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey) +#define ArrowMetadataGetValue NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue) +#define ArrowMetadataBuilderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit) +#define ArrowMetadataBuilderAppend \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend) +#define ArrowMetadataBuilderSet \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet) +#define ArrowMetadataBuilderRemove \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove) +#define ArrowSchemaViewInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit) +#define ArrowSchemaToString NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString) +#define ArrowArrayInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) +#define ArrowArrayInitFromSchema \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) +#define ArrowArrayAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary) +#define ArrowArraySetValidityBitmap \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap) +#define ArrowArraySetBuffer NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer) +#define ArrowArrayReserve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve) +#define ArrowArrayFinishBuilding \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) +#define ArrowArrayFinishBuildingDefault \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) +#define ArrowArrayViewInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) +#define ArrowArrayViewInitFromSchema \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) +#define ArrowArrayViewAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) +#define ArrowArrayViewAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary) +#define ArrowArrayViewSetLength \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength) +#define ArrowArrayViewSetArray \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) +#define ArrowArrayViewSetArrayMinimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) +#define ArrowArrayViewValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) +#define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) +#define ArrowBasicArrayStreamInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) +#define ArrowBasicArrayStreamSetArray \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray) +#define ArrowBasicArrayStreamValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate) + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/// \defgroup nanoarrow Nanoarrow C library +/// +/// Except where noted, objects are not thread-safe and clients should +/// take care to serialize accesses to methods. +/// +/// Because this library is intended to be vendored, it provides full type +/// definitions and encourages clients to stack or statically allocate +/// where convenient. + +/// \defgroup nanoarrow-malloc Memory management +/// +/// Non-buffer members of a struct ArrowSchema and struct ArrowArray +/// must be allocated using ArrowMalloc() or ArrowRealloc() and freed +/// using ArrowFree() for schemas and arrays allocated here. Buffer members +/// are allocated using an ArrowBufferAllocator. +/// +/// @{ + +/// \brief Allocate like malloc() +void* ArrowMalloc(int64_t size); + +/// \brief Reallocate like realloc() +void* ArrowRealloc(void* ptr, int64_t size); + +/// \brief Free a pointer allocated using ArrowMalloc() or ArrowRealloc(). +void ArrowFree(void* ptr); + +/// \brief Return the default allocator +/// +/// The default allocator uses ArrowMalloc(), ArrowRealloc(), and +/// ArrowFree(). +struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void); + +/// \brief Create a custom deallocator +/// +/// Creates a buffer allocator with only a free method that can be used to +/// attach a custom deallocator to an ArrowBuffer. This may be used to +/// avoid copying an existing buffer that was not allocated using the +/// infrastructure provided here (e.g., by an R or Python object). +struct ArrowBufferAllocator ArrowBufferDeallocator(ArrowBufferDeallocatorCallback, + void* private_data); + +/// @} + +/// \brief Move the contents of an src ArrowSchema into dst and set src->release to NULL +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst); + +/// \brief Call the release callback of an ArrowSchema +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowSchemaRelease(struct ArrowSchema* schema); + +/// \brief Move the contents of an src ArrowArray into dst and set src->release to NULL +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst); + +/// \brief Call the release callback of an ArrowArray +static inline void ArrowArrayRelease(struct ArrowArray* array); + +/// \brief Move the contents of an src ArrowArrayStream into dst and set src->release to +/// NULL \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dst); + +/// \brief Call the get_schema callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_schema callback, this wrapper checks the return code +/// and propagates the error reported by get_last_error into error. This +/// makes it significantly less verbose to iterate over array streams +/// using NANOARROW_RETURN_NOT_OK()-style error handling. +static inline ArrowErrorCode ArrowArrayStreamGetSchema( + struct ArrowArrayStream* array_stream, struct ArrowSchema* out, + struct ArrowError* error); + +/// \brief Call the get_schema callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_next callback, this wrapper checks the return code +/// and propagates the error reported by get_last_error into error. This +/// makes it significantly less verbose to iterate over array streams +/// using NANOARROW_RETURN_NOT_OK()-style error handling. +static inline ArrowErrorCode ArrowArrayStreamGetNext( + struct ArrowArrayStream* array_stream, struct ArrowArray* out, + struct ArrowError* error); + +/// \brief Call the get_next callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_next callback, this function never returns NULL (i.e., its +/// result is safe to use in printf-style error formatters). Null values from the +/// original callback are reported as "". +static inline const char* ArrowArrayStreamGetLastError( + struct ArrowArrayStream* array_stream); + +/// \brief Call the release callback of an ArrowArrayStream +static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream); + +/// \defgroup nanoarrow-errors Error handling +/// +/// Functions generally return an errno-compatible error code; functions that +/// need to communicate more verbose error information accept a pointer +/// to an ArrowError. This can be stack or statically allocated. The +/// content of the message is undefined unless an error code has been +/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, the +/// ArrowError pointed to by the argument will be propagated with a +/// null-terminated error message. It is safe to pass a NULL ArrowError anywhere +/// in the nanoarrow API. +/// +/// Except where documented, it is generally not safe to continue after a +/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK and +/// NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ clients can use +/// the helpers provided in the nanoarrow.hpp header to facilitate using C++ idioms +/// for memory management and error propgagtion. +/// +/// @{ + +/// \brief Set the contents of an error using printf syntax. +/// +/// If error is NULL, this function does nothing and returns NANOARROW_OK. +NANOARROW_CHECK_PRINTF_ATTRIBUTE int ArrowErrorSet(struct ArrowError* error, + const char* fmt, ...); + +/// @} + +/// \defgroup nanoarrow-utils Utility data structures +/// +/// @{ + +/// \brief Return a version string in the form "major.minor.patch" +const char* ArrowNanoarrowVersion(void); + +/// \brief Return an integer that can be used to compare versions sequentially +int ArrowNanoarrowVersionInt(void); + +/// \brief Initialize a description of buffer arrangements from a storage type +void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type); + +/// \brief Create a string view from a null-terminated string +static inline struct ArrowStringView ArrowCharView(const char* value); + +/// \brief Sets the integer value of an ArrowDecimal from a string +ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, + struct ArrowStringView value); + +/// \brief Get the integer value of an ArrowDecimal as string +ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, + struct ArrowBuffer* buffer); + +/// @} + +/// \defgroup nanoarrow-schema Creating schemas +/// +/// These functions allocate, copy, and destroy ArrowSchema structures +/// +/// @{ + +/// \brief Initialize an ArrowSchema +/// +/// Initializes the fields and release callback of schema_out. Caller +/// is responsible for calling the schema->release callback if +/// NANOARROW_OK is returned. +void ArrowSchemaInit(struct ArrowSchema* schema); + +/// \brief Initialize an ArrowSchema from an ArrowType +/// +/// A convenience constructor for that calls ArrowSchemaInit() and +/// ArrowSchemaSetType() for the common case of constructing an +/// unparameterized type. The caller is responsible for calling the schema->release +/// callback if NANOARROW_OK is returned. +ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type); + +/// \brief Get a human-readable summary of a Schema +/// +/// Writes a summary of an ArrowSchema to out (up to n - 1 characters) +/// and returns the number of characters required for the output if +/// n were sufficiently large. If recursive is non-zero, the result will +/// also include children. +int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, + char recursive); + +/// \brief Set the format field of a schema from an ArrowType +/// +/// Initializes the fields and release callback of schema_out. For +/// NANOARROW_TYPE_LIST, NANOARROW_TYPE_LARGE_LIST, and +/// NANOARROW_TYPE_MAP, the appropriate number of children are +/// allocated, initialized, and named; however, the caller must +/// ArrowSchemaSetType() on the preinitialized children. Schema must have been initialized +/// using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type); + +/// \brief Set the format field and initialize children of a struct schema +/// +/// The specified number of children are initialized; however, the caller is responsible +/// for calling ArrowSchemaSetType() and ArrowSchemaSetName() on each child. +/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children); + +/// \brief Set the format field of a fixed-size schema +/// +/// Returns EINVAL for fixed_size <= 0 or for type that is not +/// NANOARROW_TYPE_FIXED_SIZE_BINARY or NANOARROW_TYPE_FIXED_SIZE_LIST. +/// For NANOARROW_TYPE_FIXED_SIZE_LIST, the appropriate number of children are +/// allocated, initialized, and named; however, the caller must +/// ArrowSchemaSetType() the first child. Schema must have been initialized using +/// ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, + enum ArrowType type, int32_t fixed_size); + +/// \brief Set the format field of a decimal schema +/// +/// Returns EINVAL for scale <= 0 or for type that is not +/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have been +/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, + int32_t decimal_precision, + int32_t decimal_scale); + +/// \brief Set the format field of a time, timestamp, or duration schema +/// +/// Returns EINVAL for type that is not +/// NANOARROW_TYPE_TIME32, NANOARROW_TYPE_TIME64, +/// NANOARROW_TYPE_TIMESTAMP, or NANOARROW_TYPE_DURATION. The +/// timezone parameter must be NULL for a non-timestamp type. Schema must have been +/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, + enum ArrowTimeUnit time_unit, + const char* timezone); + +/// \brief Seet the format field of a union schema +/// +/// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION +/// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are +/// allocated, and initialized. +ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, + int64_t n_children); + +/// \brief Make a (recursive) copy of a schema +/// +/// Allocates and copies fields of schema into schema_out. +ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, + struct ArrowSchema* schema_out); + +/// \brief Copy format into schema->format +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format); + +/// \brief Copy name into schema->name +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name); + +/// \brief Copy metadata into schema->metadata +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy. +ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata); + +/// \brief Allocate the schema->children array +/// +/// Includes the memory for each child struct ArrowSchema. +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, + int64_t n_children); + +/// \brief Allocate the schema->dictionary member +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema); + +/// @} + +/// \defgroup nanoarrow-metadata Create, read, and modify schema metadata +/// +/// @{ + +/// \brief Reader for key/value pairs in schema metadata +/// +/// The ArrowMetadataReader does not own any data and is only valid +/// for the lifetime of the underlying metadata pointer. +struct ArrowMetadataReader { + /// \brief A metadata string from a schema->metadata field. + const char* metadata; + + /// \brief The current offset into the metadata string + int64_t offset; + + /// \brief The number of remaining keys + int32_t remaining_keys; +}; + +/// \brief Initialize an ArrowMetadataReader +ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, + const char* metadata); + +/// \brief Read the next key/value pair from an ArrowMetadataReader +ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, + struct ArrowStringView* key_out, + struct ArrowStringView* value_out); + +/// \brief The number of bytes in in a key/value metadata string +int64_t ArrowMetadataSizeOf(const char* metadata); + +/// \brief Check for a key in schema metadata +char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key); + +/// \brief Extract a value from schema metadata +/// +/// If key does not exist in metadata, value_out is unmodified +ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, + struct ArrowStringView* value_out); + +/// \brief Initialize a builder for schema metadata from key/value pairs +/// +/// metadata can be an existing metadata string or NULL to initialize +/// an empty metadata string. +ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata); + +/// \brief Append a key/value pair to a buffer containing serialized metadata +ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); + +/// \brief Set a key/value pair to a buffer containing serialized metadata +/// +/// Ensures that the only entry for key in the metadata is set to value. +/// This function maintains the existing position of (the first instance of) +/// key if present in the data. +ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); + +/// \brief Remove a key from a buffer containing serialized metadata +ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, + struct ArrowStringView key); + +/// @} + +/// \defgroup nanoarrow-schema-view Reading schemas +/// +/// @{ + +/// \brief A non-owning view of a parsed ArrowSchema +/// +/// Contains more readily extractable values than a raw ArrowSchema. +/// Clients can stack or statically allocate this structure but are +/// encouraged to use the provided getters to ensure forward +/// compatibility. +struct ArrowSchemaView { + /// \brief A pointer to the schema represented by this view + const struct ArrowSchema* schema; + + /// \brief The data type represented by the schema + /// + /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a + /// non-null dictionary member; datetime types are valid values. + /// This value will never be NANOARROW_TYPE_EXTENSION (see + /// extension_name and/or extension_metadata to check for + /// an extension type). + enum ArrowType type; + + /// \brief The storage data type represented by the schema + /// + /// This value will never be NANOARROW_TYPE_DICTIONARY, NANOARROW_TYPE_EXTENSION + /// or any datetime type. This value represents only the type required to + /// interpret the buffers in the array. + enum ArrowType storage_type; + + /// \brief The storage layout represented by the schema + struct ArrowLayout layout; + + /// \brief The extension type name if it exists + /// + /// If the ARROW:extension:name key is present in schema.metadata, + /// extension_name.data will be non-NULL. + struct ArrowStringView extension_name; + + /// \brief The extension type metadata if it exists + /// + /// If the ARROW:extension:metadata key is present in schema.metadata, + /// extension_metadata.data will be non-NULL. + struct ArrowStringView extension_metadata; + + /// \brief Format fixed size parameter + /// + /// This value is set when parsing a fixed-size binary or fixed-size + /// list schema; this value is undefined for other types. For a + /// fixed-size binary schema this value is in bytes; for a fixed-size + /// list schema this value refers to the number of child elements for + /// each element of the parent. + int32_t fixed_size; + + /// \brief Decimal bitwidth + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_bitwidth; + + /// \brief Decimal precision + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_precision; + + /// \brief Decimal scale + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_scale; + + /// \brief Format time unit parameter + /// + /// This value is set when parsing a date/time type. The value is + /// undefined for other types. + enum ArrowTimeUnit time_unit; + + /// \brief Format timezone parameter + /// + /// This value is set when parsing a timestamp type and represents + /// the timezone format parameter. This value points to + /// data within the schema and is undefined for other types. + const char* timezone; + + /// \brief Union type ids parameter + /// + /// This value is set when parsing a union type and represents + /// type ids parameter. This value points to + /// data within the schema and is undefined for other types. + const char* union_type_ids; +}; + +/// \brief Initialize an ArrowSchemaView +ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, + const struct ArrowSchema* schema, + struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-buffer Owning, growable buffers +/// +/// @{ + +/// \brief Initialize an ArrowBuffer +/// +/// Initialize a buffer with a NULL, zero-size buffer using the default +/// buffer allocator. +static inline void ArrowBufferInit(struct ArrowBuffer* buffer); + +/// \brief Set a newly-initialized buffer's allocator +/// +/// Returns EINVAL if the buffer has already been allocated. +static inline ArrowErrorCode ArrowBufferSetAllocator( + struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator); + +/// \brief Reset an ArrowBuffer +/// +/// Releases the buffer using the allocator's free method if +/// the buffer's data member is non-null, sets the data member +/// to NULL, and sets the buffer's size and capacity to 0. +static inline void ArrowBufferReset(struct ArrowBuffer* buffer); + +/// \brief Move an ArrowBuffer +/// +/// Transfers the buffer data and lifecycle management to another +/// address and resets buffer. +static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst); + +/// \brief Grow or shrink a buffer to a given capacity +/// +/// When shrinking the capacity of the buffer, the buffer is only reallocated +/// if shrink_to_fit is non-zero. Calling ArrowBufferResize() does not +/// adjust the buffer's size member except to ensure that the invariant +/// capacity >= size remains true. +static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, + int64_t new_capacity_bytes, + char shrink_to_fit); + +/// \brief Ensure a buffer has at least a given additional capacity +/// +/// Ensures that the buffer has space to append at least +/// additional_size_bytes, overallocating when required. +static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, + int64_t additional_size_bytes); + +/// \brief Write data to buffer and increment the buffer size +/// +/// This function does not check that buffer has the required capacity +static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, + int64_t size_bytes); + +/// \brief Write data to buffer and increment the buffer size +/// +/// This function writes and ensures that the buffer has the required capacity, +/// possibly by reallocating the buffer. Like ArrowBufferReserve, this will +/// overallocate when reallocation is required. +static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, + const void* data, int64_t size_bytes); + +/// \brief Write fill to buffer and increment the buffer size +/// +/// This function writes the specified number of fill bytes and +/// ensures that the buffer has the required capacity, +static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, + uint8_t value, int64_t size_bytes); + +/// \brief Write an 8-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, + int8_t value); + +/// \brief Write an unsigned 8-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, + uint8_t value); + +/// \brief Write a 16-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, + int16_t value); + +/// \brief Write an unsigned 16-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, + uint16_t value); + +/// \brief Write a 32-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, + int32_t value); + +/// \brief Write an unsigned 32-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, + uint32_t value); + +/// \brief Write a 64-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, + int64_t value); + +/// \brief Write an unsigned 64-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, + uint64_t value); + +/// \brief Write a double to a buffer +static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, + double value); + +/// \brief Write a float to a buffer +static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, + float value); + +/// \brief Write an ArrowStringView to a buffer +static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, + struct ArrowStringView value); + +/// \brief Write an ArrowBufferView to a buffer +static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, + struct ArrowBufferView value); + +/// @} + +/// \defgroup nanoarrow-bitmap Bitmap utilities +/// +/// @{ + +/// \brief Extract a boolean value from a bitmap +static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap to true +static inline void ArrowBitSet(uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap to false +static inline void ArrowBitClear(uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t value); + +/// \brief Set a boolean value to a range in a bitmap +static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, + uint8_t bits_are_set); + +/// \brief Count true values in a bitmap +static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); + +/// \brief Extract int8 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out); + +/// \brief Extract int32 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, + int64_t length, int32_t* out); + +/// \brief Initialize an ArrowBitmap +/// +/// Initialize the builder's buffer, empty its cache, and reset the size to zero +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap); + +/// \brief Move an ArrowBitmap +/// +/// Transfers the underlying buffer data and lifecycle management to another +/// address and resets the bitmap. +static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst); + +/// \brief Ensure a bitmap builder has at least a given additional capacity +/// +/// Ensures that the buffer has space to append at least +/// additional_size_bits, overallocating when required. +static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, + int64_t additional_size_bits); + +/// \brief Grow or shrink a bitmap to a given capacity +/// +/// When shrinking the capacity of the bitmap, the bitmap is only reallocated +/// if shrink_to_fit is non-zero. Calling ArrowBitmapResize() does not +/// adjust the buffer's size member except when shrinking new_capacity_bits +/// to a value less than the current number of bits in the bitmap. +static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, + int64_t new_capacity_bits, + char shrink_to_fit); + +/// \brief Reserve space for and append zero or more of the same boolean value to a bitmap +static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length); + +/// \brief Append zero or more of the same boolean value to a bitmap +static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length); + +/// \brief Append boolean values encoded as int8_t to a bitmap +/// +/// The values must all be 0 or 1. +static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + const int8_t* values, int64_t n_values); + +/// \brief Append boolean values encoded as int32_t to a bitmap +/// +/// The values must all be 0 or 1. +static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, + const int32_t* values, int64_t n_values); + +/// \brief Reset a bitmap builder +/// +/// Releases any memory held by buffer, empties the cache, and resets the size to zero +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap); + +/// @} + +/// \defgroup nanoarrow-array Creating arrays +/// +/// These functions allocate, copy, and destroy ArrowArray structures. +/// Once an ArrowArray has been initialized via ArrowArrayInitFromType() +/// or ArrowArrayInitFromSchema(), the caller is responsible for releasing +/// it using the embedded release callback. +/// +/// @{ + +/// \brief Initialize the fields of an array +/// +/// Initializes the fields and release callback of array. Caller +/// is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, + enum ArrowType storage_type); + +/// \brief Initialize the contents of an ArrowArray from an ArrowSchema +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, + const struct ArrowSchema* schema, + struct ArrowError* error); + +/// \brief Initialize the contents of an ArrowArray from an ArrowArrayView +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + const struct ArrowArrayView* array_view, + struct ArrowError* error); + +/// \brief Allocate the array->children array +/// +/// Includes the memory for each child struct ArrowArray, +/// whose members are marked as released and may be subsequently initialized +/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. +/// schema must have been allocated using ArrowArrayInitFromType(). +ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children); + +/// \brief Allocate the array->dictionary member +/// +/// Includes the memory for the struct ArrowArray, whose contents +/// is marked as released and may be subsequently initialized +/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array); + +/// \brief Set the validity bitmap of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap); + +/// \brief Set a buffer of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, + struct ArrowBuffer* buffer); + +/// \brief Get the validity bitmap of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array); + +/// \brief Get a buffer of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i); + +/// \brief Start element-wise appending to an ArrowArray +/// +/// Initializes any values needed to use ArrowArrayAppend*() functions. +/// All element-wise appenders append by value and return EINVAL if the exact value +/// cannot be represented by the underlying storage type. +/// array must have been allocated using ArrowArrayInitFromType() +static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array); + +/// \brief Reserve space for future appends +/// +/// For buffer sizes that can be calculated (i.e., not string data buffers or +/// child array sizes for non-fixed-size arrays), recursively reserve space for +/// additional elements. This is useful for reducing the number of reallocations +/// that occur using the item-wise appenders. +ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, + int64_t additional_size_elements); + +/// \brief Append a null value to an array +static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n); + +/// \brief Append an empty, non-null value to an array +static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n); + +/// \brief Append a signed integer value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range). +static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, int64_t value); + +/// \brief Append an unsigned integer value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range). +static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, + uint64_t value); + +/// \brief Append a double value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range or there is an attempt to append +/// a non-integer to an array with an integer storage type). +static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, + double value); + +/// \brief Append a string of bytes to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a +/// binary, string, large binary, large string, or fixed-size binary array, or value is +/// the wrong size for a fixed-size binary array). +static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, + struct ArrowBufferView value); + +/// \brief Append a string value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a +/// string or large string array). +static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, + struct ArrowStringView value); + +/// \brief Append a Interval to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + const struct ArrowInterval* value); + +/// \brief Append a decimal value to an array +/// +/// Returns NANOARROW_OK if array is a decimal array with the appropriate +/// bitwidth or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, + const struct ArrowDecimal* value); + +/// \brief Finish a nested array element +/// +/// Appends a non-null element to the array based on the first child's current +/// length. Returns NANOARROW_OK if the item was successfully added, EOVERFLOW +/// if the child of a list or map array would exceed INT_MAX elements, or EINVAL +/// if the underlying storage type is not a struct, list, large list, or fixed-size +/// list, or if there was an attempt to add a struct or fixed-size list element where the +/// length of the child array(s) did not match the expected length. +static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array); + +/// \brief Finish a union array element +/// +/// Appends an element to the union type ids buffer and increments array->length. +/// For sparse unions, up to one element is added to non type-id children. Returns +/// EINVAL if the underlying storage type is not a union, if type_id is not valid, +/// or if child sizes after appending are inconsistent. +static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, + int8_t type_id); + +/// \brief Shrink buffer capacity to the size required +/// +/// Also applies shrinking to any child arrays. array must have been allocated using +/// ArrowArrayInitFromType +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array); + +/// \brief Finish building an ArrowArray +/// +/// Flushes any pointers from internal buffers that may have been reallocated +/// into array->buffers and checks the actual size of the buffers +/// against the expected size based on the final length. +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Finish building an ArrowArray with explicit validation +/// +/// Finish building with an explicit validation level. This could perform less validation +/// (i.e. NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU +/// buffer data access is not possible or more validation (i.e., +/// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or +/// corruptible source. +ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-array-view Reading arrays +/// +/// These functions read and validate the contents ArrowArray structures. +/// +/// @{ + +/// \brief Initialize the contents of an ArrowArrayView +void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, + enum ArrowType storage_type); + +/// \brief Move an ArrowArrayView +/// +/// Transfers the ArrowArrayView data and lifecycle management to another +/// address and resets the contents of src. +static inline void ArrowArrayViewMove(struct ArrowArrayView* src, + struct ArrowArrayView* dst); + +/// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema +ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, + const struct ArrowSchema* schema, + struct ArrowError* error); + +/// \brief Allocate the array_view->children array +/// +/// Includes the memory for each child struct ArrowArrayView +ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, + int64_t n_children); + +/// \brief Allocate array_view->dictionary +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view); + +/// \brief Set data-independent buffer sizes from length +void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); + +/// \brief Set buffer sizes and data pointers from an ArrowArray +ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Set buffer sizes and data pointers from an ArrowArray except for those +/// that require dereferencing buffer content. +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Performs checks on the content of an ArrowArrayView +/// +/// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray, +/// the buffer sizes and some content (fist and last offset) have already +/// been validated at the "default" level. If setting the buffer pointers +/// and sizes otherwise, you may wish to perform checks at a different level. See +/// documentation for ArrowValidationLevel for the details of checks performed +/// at each level. +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); + +/// \brief Reset the contents of an ArrowArrayView and frees resources +void ArrowArrayViewReset(struct ArrowArrayView* array_view); + +/// \brief Check for a null element in an ArrowArrayView +static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get the type id of a union array element +static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get the child index of a union array element +static inline int8_t ArrowArrayViewUnionChildIndex( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the index to use into the relevant union child array +static inline int64_t ArrowArrayViewUnionChildOffset( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an integer +/// +/// This function does not check for null values, that values are actually integers, or +/// that values are within a valid range for an int64. +static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get an element in an ArrowArrayView as an unsigned integer +/// +/// This function does not check for null values, that values are actually integers, or +/// that values are within a valid range for a uint64. +static inline uint64_t ArrowArrayViewGetUIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as a double +/// +/// This function does not check for null values, or +/// that values are within a valid range for a double. +static inline double ArrowArrayViewGetDoubleUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowStringView +/// +/// This function does not check for null values. +static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowBufferView +/// +/// This function does not check for null values. +static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowDecimal +/// +/// This function does not check for null values. The out parameter must +/// be initialized with ArrowDecimalInit() with the proper parameters for this +/// type before calling this for the first time. +static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, + int64_t i, struct ArrowDecimal* out); + +/// @} + +/// \defgroup nanoarrow-basic-array-stream Basic ArrowArrayStream implementation +/// +/// An implementation of an ArrowArrayStream based on a collection of +/// zero or more previously-existing ArrowArray objects. Users should +/// initialize and/or validate the contents before transferring the +/// responsibility of the ArrowArrayStream elsewhere. +/// +/// @{ + +/// \brief Initialize an ArrowArrayStream backed by this implementation +/// +/// This function moves the ownership of schema to the array_stream. If +/// this function returns NANOARROW_OK, the caller is responsible for +/// releasing the ArrowArrayStream. +ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema, int64_t n_arrays); + +/// \brief Set the ith ArrowArray in this ArrowArrayStream. +/// +/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). +/// This function move the ownership of array to the array_stream. i must +/// be greater than zero and less than the value of n_arrays passed in +/// ArrowBasicArrayStreamInit(). Callers are not required to fill all +/// n_arrays members (i.e., n_arrays is a maximum bound). +void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, + struct ArrowArray* array); + +/// \brief Validate the contents of this ArrowArrayStream +/// +/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). +/// This function uses ArrowArrayStreamInitFromSchema() and ArrowArrayStreamSetArray() +/// to validate the contents of the arrays. +ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, + struct ArrowError* error); + +/// @} + +// Undefine ArrowErrorCode, which may have been defined to annotate functions that return +// it to warn for an unused result. +#if defined(ArrowErrorCode) +#undef ArrowErrorCode +#endif + +// Inline function definitions + + + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_BUFFER_INLINE_H_INCLUDED +#define NANOARROW_BUFFER_INLINE_H_INCLUDED + +#include +#include +#include + + + +#ifdef __cplusplus +extern "C" { +#endif + +static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) { + int64_t doubled_capacity = current_capacity * 2; + if (doubled_capacity > new_capacity) { + return doubled_capacity; + } else { + return new_capacity; + } +} + +static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { + buffer->data = NULL; + buffer->size_bytes = 0; + buffer->capacity_bytes = 0; + buffer->allocator = ArrowBufferAllocatorDefault(); +} + +static inline ArrowErrorCode ArrowBufferSetAllocator( + struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) { + // This is not a perfect test for "has a buffer already been allocated" + // but is likely to catch most cases. + if (buffer->data == NULL) { + buffer->allocator = allocator; + return NANOARROW_OK; + } else { + return EINVAL; + } +} + +static inline void ArrowBufferReset(struct ArrowBuffer* buffer) { + buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, + buffer->capacity_bytes); + ArrowBufferInit(buffer); +} + +static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) { + memcpy(dst, src, sizeof(struct ArrowBuffer)); + src->data = NULL; + ArrowBufferInit(src); +} + +static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, + int64_t new_capacity_bytes, + char shrink_to_fit) { + if (new_capacity_bytes < 0) { + return EINVAL; + } + + if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) { + buffer->data = buffer->allocator.reallocate( + &buffer->allocator, buffer->data, buffer->capacity_bytes, new_capacity_bytes); + if (buffer->data == NULL && new_capacity_bytes > 0) { + buffer->capacity_bytes = 0; + buffer->size_bytes = 0; + return ENOMEM; + } + + buffer->capacity_bytes = new_capacity_bytes; + } + + // Ensures that when shrinking that size <= capacity + if (new_capacity_bytes < buffer->size_bytes) { + buffer->size_bytes = new_capacity_bytes; + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, + int64_t additional_size_bytes) { + int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes; + if (min_capacity_bytes <= buffer->capacity_bytes) { + return NANOARROW_OK; + } + + return ArrowBufferResize( + buffer, _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), 0); +} + +static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, + int64_t size_bytes) { + if (size_bytes > 0) { + memcpy(buffer->data + buffer->size_bytes, data, size_bytes); + buffer->size_bytes += size_bytes; + } +} + +static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, + const void* data, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + + ArrowBufferAppendUnsafe(buffer, data, size_bytes); + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, + int8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int8_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, + uint8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint8_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, + int16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int16_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, + uint16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, + int32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int32_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, + uint32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, + int64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, + uint64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, + double value) { + return ArrowBufferAppend(buffer, &value, sizeof(double)); +} + +static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, + float value) { + return ArrowBufferAppend(buffer, &value, sizeof(float)); +} + +static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, + struct ArrowStringView value) { + return ArrowBufferAppend(buffer, value.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, + struct ArrowBufferView value) { + return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, + uint8_t value, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + + memset(buffer->data + buffer->size_bytes, value, size_bytes); + buffer->size_bytes += size_bytes; + return NANOARROW_OK; +} + +static const uint8_t _ArrowkBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; +static const uint8_t _ArrowkFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; +static const uint8_t _ArrowkPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; +static const uint8_t _ArrowkTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; + +static const uint8_t _ArrowkBytePopcount[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, + 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, + 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, + 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, + 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, + 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, + 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, + 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +static inline int64_t _ArrowRoundUpToMultipleOf8(int64_t value) { + return (value + 7) & ~((int64_t)7); +} + +static inline int64_t _ArrowRoundDownToMultipleOf8(int64_t value) { + return (value / 8) * 8; +} + +static inline int64_t _ArrowBytesForBits(int64_t bits) { + return (bits >> 3) + ((bits & 7) != 0); +} + +static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; +} + +static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; +} + +static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { + *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +} + +static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { + *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +} + +static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { + return (bits[i >> 3] >> (i & 0x07)) & 1; +} + +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt8(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + +static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, + int64_t length, int32_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt32(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + +static inline void ArrowBitSet(uint8_t* bits, int64_t i) { + bits[i / 8] |= _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitClear(uint8_t* bits, int64_t i) { + bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; +} + +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { + bits[i / 8] ^= + ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, + uint8_t bits_are_set) { + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const uint8_t fill_byte = (uint8_t)(-bits_are_set); + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_end = i_end / 8 + 1; + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; + + if (bytes_end == bytes_begin + 1) { + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); + bits[bytes_begin] &= only_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte + bits[bytes_begin] &= first_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { + // set/clear whole bytes + memset(bits + bytes_begin + 1, fill_byte, (size_t)(bytes_end - bytes_begin - 2)); + } + + if (i_end % 8 == 0) { + return; + } + + // set/clear leading bits of last byte + bits[bytes_end - 1] &= last_byte_mask; + bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); +} + +static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset, + int64_t length) { + if (length == 0) { + return 0; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + // count bits within a single byte + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; + + const uint8_t only_byte_mask = + i_end % 8 == 0 ? last_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); + + const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; + return _ArrowkBytePopcount[byte_masked]; + } + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = i_end % 8 == 0 ? 0 : _ArrowkTrailingBitmask[i_end % 8]; + int64_t count = 0; + + // first byte + count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + count += _ArrowkBytePopcount[bits[i]]; + } + + // last byte + count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; + + return count; +} + +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap) { + ArrowBufferInit(&bitmap->buffer); + bitmap->size_bits = 0; +} + +static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst) { + ArrowBufferMove(&src->buffer, &dst->buffer); + dst->size_bits = src->size_bits; + src->size_bits = 0; +} + +static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, + int64_t additional_size_bits) { + int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; + if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { + return NANOARROW_OK; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferReserve(&bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); + + bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, + int64_t new_capacity_bits, + char shrink_to_fit) { + if (new_capacity_bits < 0) { + return EINVAL; + } + + int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); + NANOARROW_RETURN_NOT_OK( + ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); + + if (new_capacity_bits < bitmap->size_bits) { + bitmap->size_bits = new_capacity_bits; + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); + + ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); + return NANOARROW_OK; +} + +static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length) { + ArrowBitsSetTo(bitmap->buffer.data, bitmap->size_bits, length, bits_are_set); + bitmap->size_bits += length; + bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); +} + +static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + const int8_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } + + const int8_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt8(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, + const int32_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } + + const int32_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt32(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { + ArrowBufferReset(&bitmap->buffer); + bitmap->size_bits = 0; +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_ARRAY_INLINE_H_INCLUDED +#define NANOARROW_ARRAY_INLINE_H_INCLUDED + +#include +#include +#include +#include +#include + + + + +#ifdef __cplusplus +extern "C" { +#endif + +static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + return &private_data->bitmap; +} + +static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + switch (i) { + case 0: + return &private_data->bitmap.buffer; + default: + return private_data->buffers + i - 1; + } +} + +// We don't currently support the case of unions where type_id != child_index; +// however, these functions are used to keep track of where that assumption +// is made. +static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array, + int8_t type_id) { + NANOARROW_UNUSED(array); + return type_id; +} + +static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array, + int8_t child_index) { + NANOARROW_UNUSED(array); + return child_index; +} + +static inline int32_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { + if (*type_ids == '\0') { + return 0; + } + + int32_t i = 0; + long type_id; + char* end_ptr; + do { + type_id = strtol(type_ids, &end_ptr, 10); + if (end_ptr == type_ids || type_id < 0 || type_id > 127) { + return -1; + } + + if (out != NULL) { + out[i] = (int8_t)type_id; + } + + i++; + + type_ids = end_ptr; + if (*type_ids == '\0') { + return i; + } else if (*type_ids != ',') { + return -1; + } else { + type_ids++; + } + } while (1); + + return -1; +} + +static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const int8_t* type_ids, + int64_t n_type_ids, + int64_t n_children) { + if (n_type_ids != n_children) { + return 0; + } + + for (int8_t i = 0; i < n_type_ids; i++) { + if (type_ids[i] != i) { + return 0; + } + } + + return 1; +} + +static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str, + int64_t n_children) { + int8_t type_ids[128]; + int32_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); + return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, n_children); +} + +static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + return EINVAL; + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + // Note that this value could be -1 if the type_ids string was invalid + if (private_data->union_type_id_is_child_index != 1) { + return EINVAL; + } else { + break; + } + default: + break; + } + if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { + return EINVAL; + } + + // Initialize any data offset buffer with a single zero + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 64) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); + } else if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 32) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); + } + } + + // Start building any child arrays or dictionaries + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); + } + + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendBits(struct ArrowArray* array, + int64_t buffer_i, uint8_t value, + int64_t n) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); + int64_t bytes_required = + _ArrowRoundUpToMultipleOf8(private_data->layout.element_size_bits[buffer_i] * + (array->length + 1)) / + 8; + if (bytes_required > buffer->size_bytes) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(buffer, 0, bytes_required - buffer->size_bytes)); + } + + ArrowBitsSetTo(buffer->data, array->length, n, value); + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* array, + int64_t n, uint8_t is_valid) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + if (n == 0) { + return NANOARROW_OK; + } + + // Some type-specific handling + switch (private_data->storage_type) { + case NANOARROW_TYPE_NA: + // (An empty value for a null array *is* a null) + array->null_count += n; + array->length += n; + return NANOARROW_OK; + + case NANOARROW_TYPE_DENSE_UNION: { + // Add one null to the first child and append n references to that child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendEmptyInternal(array->children[0], 1, is_valid)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + for (int64_t i = 0; i < n; i++) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), (int32_t)array->children[0]->length - 1)); + } + // For the purposes of array->null_count, union elements are never considered "null" + // even if some children contain nulls. + array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_SPARSE_UNION: { + // Add n nulls to the first child and append n references to that child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendEmptyInternal(array->children[0], n, is_valid)); + for (int64_t i = 1; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + // For the purposes of array->null_count, union elements are never considered "null" + // even if some children contain nulls. + array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( + array->children[0], n * private_data->layout.child_size_elements)); + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); + } + break; + + default: + break; + } + + // Append n is_valid bits to the validity bitmap. If we haven't allocated a bitmap yet + // and we need to append nulls, do it now. + if (!is_valid && private_data->bitmap.buffer.data == NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, array->length + n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } else if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } + + // Add appropriate buffer fill + struct ArrowBuffer* buffer; + int64_t size_bytes; + + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + buffer = ArrowArrayBuffer(array, i); + size_bytes = private_data->layout.element_size_bits[i] / 8; + + switch (private_data->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_NONE: + case NANOARROW_BUFFER_TYPE_VALIDITY: + continue; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Append the current value at the end of the offset buffer for each element + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes * n)); + + for (int64_t j = 0; j < n; j++) { + ArrowBufferAppendUnsafe(buffer, buffer->data + size_bytes * (array->length + j), + size_bytes); + } + + // Skip the data buffer + i++; + continue; + case NANOARROW_BUFFER_TYPE_DATA: + // Zero out the next bit of memory + if (private_data->layout.element_size_bits[i] % 8 == 0) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill(buffer, 0, size_bytes * n)); + } else { + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, i, 0, n)); + } + continue; + + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + // These cases return above + return EINVAL; + } + } + + array->length += n; + array->null_count += n * !is_valid; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 0); +} + +static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 1); +} + +static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, + int64_t value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INT64: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); + break; + case NANOARROW_TYPE_INT32: + _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, (int32_t)value)); + break; + case NANOARROW_TYPE_INT16: + _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt16(data_buffer, (int16_t)value)); + break; + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_RANGE(value, INT8_MIN, INT8_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(data_buffer, (int8_t)value)); + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); + return ArrowArrayAppendUInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, + uint64_t value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UINT64: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); + break; + case NANOARROW_TYPE_UINT32: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); + break; + case NANOARROW_TYPE_UINT16: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); + break; + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); + break; + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); + return ArrowArrayAppendInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, + double value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(double))); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, + struct ArrowBufferView value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); + struct ArrowBuffer* data_buffer = ArrowArrayBuffer( + array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); + int32_t offset; + int64_t large_offset; + int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + offset = ((int32_t*)offset_buffer->data)[array->length]; + if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { + return EOVERFLOW; + } + + offset += (int32_t)value.size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + large_offset = ((int64_t*)offset_buffer->data)[array->length]; + large_offset += value.size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (value.size_bytes != fixed_size_bytes) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, + struct ArrowStringView value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBufferView buffer_view; + buffer_view.data.data = value.data; + buffer_view.size_bytes = value.size_bytes; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + return ArrowArrayAppendBytes(array, buffer_view); + default: + return EINVAL; + } +} + +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + const struct ArrowInterval* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->ms)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(data_buffer, value->ns)); + break; + } + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, + const struct ArrowDecimal* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + if (value->n_words != 2) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value->words, 2 * sizeof(uint64_t))); + break; + } + case NANOARROW_TYPE_DECIMAL256: + if (value->n_words != 4) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value->words, 4 * sizeof(uint64_t))); + break; + } + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + int64_t child_length; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + child_length = array->children[0]->length; + if (child_length > INT32_MAX) { + return EOVERFLOW; + } + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), (int32_t)child_length)); + break; + case NANOARROW_TYPE_LARGE_LIST: + child_length = array->children[0]->length; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(ArrowArrayBuffer(array, 1), child_length)); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_length = array->children[0]->length; + if (child_length != + ((array->length + 1) * private_data->layout.child_size_elements)) { + return EINVAL; + } + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + child_length = array->children[i]->length; + if (child_length != (array->length + 1)) { + return EINVAL; + } + } + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, + int8_t type_id) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); + if (child_index < 0 || child_index >= array->n_children) { + return EINVAL; + } + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + // Append the target child length to the union offsets buffer + _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); + break; + case NANOARROW_TYPE_SPARSE_UNION: + // Append one empty to any non-target column that isn't already the right length + // or abort if appending a null will result in a column with invalid length + for (int64_t i = 0; i < array->n_children; i++) { + if (i == child_index || array->children[i]->length == (array->length + 1)) { + continue; + } + + if (array->children[i]->length != array->length) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1)); + } + + break; + default: + return EINVAL; + } + + // Write to the type_ids buffer + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); + array->length++; + return NANOARROW_OK; +} + +static inline void ArrowArrayViewMove(struct ArrowArrayView* src, + struct ArrowArrayView* dst) { + memcpy(dst, src, sizeof(struct ArrowArrayView)); + ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); +} + +static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, + int64_t i) { + const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_NA: + return 0x01; + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + // Unions are "never null" in Arrow land + return 0x00; + default: + return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); + } +} + +static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + return array_view->buffer_views[0].data.as_int8[i]; + default: + return -1; + } +} + +static inline int8_t ArrowArrayViewUnionChildIndex( + const struct ArrowArrayView* array_view, int64_t i) { + int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); + if (array_view->union_type_id_map == NULL) { + return type_id; + } else { + return array_view->union_type_id_map[type_id]; + } +} + +static inline int64_t ArrowArrayViewUnionChildOffset( + const struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_SPARSE_UNION: + return i; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewListChildOffset( + const struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_LARGE_LIST: + return array_view->buffer_views[1].data.as_int64[i]; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, + int64_t i) { + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (int64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (int64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return INT64_MAX; + } +} + +static inline uint64_t ArrowArrayViewGetUIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (uint64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (uint64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return UINT64_MAX; + } +} + +static inline double ArrowArrayViewGetDoubleUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return (double)data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return (double)data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return DBL_MAX; + } +} + +static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const char* data_view = array_view->buffer_views[2].data.as_char; + + struct ArrowStringView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.data = data_view + offsets_view->data.as_int32[i]; + view.size_bytes = + offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.data = data_view + offsets_view->data.as_int64[i]; + view.size_bytes = + offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes); + break; + default: + view.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; + + struct ArrowBufferView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.size_bytes = + offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.size_bytes = + offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data.as_uint8 = + array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes); + break; + default: + view.data.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline void ArrowArrayViewGetIntervalUnsafe( + const struct ArrowArrayView* array_view, int64_t i, struct ArrowInterval* out) { + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + const size_t size = sizeof(int32_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + const size_t size = sizeof(int32_t) + sizeof(int32_t); + memcpy(&out->days, data_view + i * size, sizeof(int32_t)); + memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + const size_t size = sizeof(int32_t) + sizeof(int32_t) + sizeof(int64_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); + memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); + break; + } + default: + break; + } +} + +static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, + int64_t i, struct ArrowDecimal* out) { + i += array_view->offset; + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + ArrowDecimalSetBytes(out, data_view + (i * 16)); + break; + case NANOARROW_TYPE_DECIMAL256: + ArrowDecimalSetBytes(out, data_view + (i * 32)); + break; + default: + memset(out->words, 0, sizeof(out->words)); + break; + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libtiledbsoma/src/utils/nanoarrow.hpp b/libtiledbsoma/src/utils/nanoarrow.hpp new file mode 100644 index 0000000000..09a031511b --- /dev/null +++ b/libtiledbsoma/src/utils/nanoarrow.hpp @@ -0,0 +1,553 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "nanoarrow.h" + +#ifndef NANOARROW_HPP_INCLUDED +#define NANOARROW_HPP_INCLUDED + +/// \defgroup nanoarrow_hpp Nanoarrow C++ Helpers +/// +/// The utilities provided in this file are intended to support C++ users +/// of the nanoarrow C library such that C++-style resource allocation +/// and error handling can be used with nanoarrow data structures. +/// These utilities are not intended to mirror the nanoarrow C API. + +namespace nanoarrow { + +/// \defgroup nanoarrow_hpp-errors Error handling helpers +/// +/// Most functions in the C API return an ArrowErrorCode to communicate +/// possible failure. Except where documented, it is usually not safe to +/// continue after a non-zero value has been returned. While the +/// nanoarrow C++ helpers do not throw any exceptions of their own, +/// these helpers are provided to facilitate using the nanoarrow C++ helpers +/// in frameworks where this is a useful error handling idiom. +/// +/// @{ + +class Exception : public std::exception { + public: + Exception(const std::string& msg) : msg_(msg) {} + const char* what() const noexcept { return msg_.c_str(); } + + private: + std::string msg_; +}; + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_THROW_NOT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + throw nanoarrow::Exception( \ + std::string(EXPR_STR) + std::string(" failed with errno ") + \ + std::to_string(NAME) + std::string("\n * ") + std::string(__FILE__) + \ + std::string(":") + std::to_string(__LINE__) + std::string("\n")); \ + } \ + } while (0) +#else +#define _NANOARROW_THROW_NOT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + throw nanoarrow::Exception(std::string(EXPR_STR) + \ + std::string(" failed with errno ") + \ + std::to_string(NAME)); \ + } \ + } while (0) +#endif + +#define NANOARROW_THROW_NOT_OK(EXPR) \ + _NANOARROW_THROW_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, \ + #EXPR) + +/// @} + +namespace internal { + +/// \defgroup nanoarrow_hpp-unique_base Base classes for Unique wrappers +/// +/// @{ + +template +static inline void init_pointer(T* data); + +template +static inline void move_pointer(T* src, T* dst); + +template +static inline void release_pointer(T* data); + +template <> +inline void init_pointer(struct ArrowSchema* data) { + data->release = nullptr; +} + +template <> +inline void move_pointer(struct ArrowSchema* src, struct ArrowSchema* dst) { + ArrowSchemaMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowSchema* data) { + if (data->release != nullptr) { + data->release(data); + } +} + +template <> +inline void init_pointer(struct ArrowArray* data) { + data->release = nullptr; +} + +template <> +inline void move_pointer(struct ArrowArray* src, struct ArrowArray* dst) { + ArrowArrayMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowArray* data) { + if (data->release != nullptr) { + data->release(data); + } +} + +template <> +inline void init_pointer(struct ArrowArrayStream* data) { + data->release = nullptr; +} + +template <> +inline void move_pointer(struct ArrowArrayStream* src, struct ArrowArrayStream* dst) { + ArrowArrayStreamMove(src, dst); +} + +template <> +inline void release_pointer(ArrowArrayStream* data) { + if (data->release != nullptr) { + data->release(data); + } +} + +template <> +inline void init_pointer(struct ArrowBuffer* data) { + ArrowBufferInit(data); +} + +template <> +inline void move_pointer(struct ArrowBuffer* src, struct ArrowBuffer* dst) { + ArrowBufferMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowBuffer* data) { + ArrowBufferReset(data); +} + +template <> +inline void init_pointer(struct ArrowBitmap* data) { + ArrowBitmapInit(data); +} + +template <> +inline void move_pointer(struct ArrowBitmap* src, struct ArrowBitmap* dst) { + ArrowBitmapMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowBitmap* data) { + ArrowBitmapReset(data); +} + +template <> +inline void init_pointer(struct ArrowArrayView* data) { + ArrowArrayViewInitFromType(data, NANOARROW_TYPE_UNINITIALIZED); +} + +template <> +inline void move_pointer(struct ArrowArrayView* src, struct ArrowArrayView* dst) { + ArrowArrayViewMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowArrayView* data) { + ArrowArrayViewReset(data); +} + +/// \brief A unique_ptr-like base class for stack-allocatable objects +/// \tparam T The object type +template +class Unique { + public: + /// \brief Construct an invalid instance of T holding no resources + Unique() { init_pointer(&data_); } + + /// \brief Move and take ownership of data + Unique(T* data) { move_pointer(data, &data_); } + + /// \brief Move and take ownership of data wrapped by rhs + Unique(Unique&& rhs) : Unique(rhs.get()) {} + Unique& operator=(Unique&& rhs) { + reset(rhs.get()); + return *this; + } + + // These objects are not copyable + Unique(const Unique& rhs) = delete; + + /// \brief Get a pointer to the data owned by this object + T* get() noexcept { return &data_; } + const T* get() const noexcept { return &data_; } + + /// \brief Use the pointer operator to access fields of this object + T* operator->() noexcept { return &data_; } + const T* operator->() const noexcept { return &data_; } + + /// \brief Call data's release callback if valid + void reset() { release_pointer(&data_); } + + /// \brief Call data's release callback if valid and move ownership of the data + /// pointed to by data + void reset(T* data) { + reset(); + move_pointer(data, &data_); + } + + /// \brief Move ownership of this object to the data pointed to by out + void move(T* out) { move_pointer(&data_, out); } + + ~Unique() { reset(); } + + protected: + T data_; +}; + +template +static inline void DeallocateWrappedBuffer(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t size) { + auto obj = reinterpret_cast(allocator->private_data); + delete obj; +} + +/// @} + +} // namespace internal + +/// \defgroup nanoarrow_hpp-unique Unique object wrappers +/// +/// The Arrow C Data interface, the Arrow C Stream interface, and the +/// nanoarrow C library use stack-allocatable objects, some of which +/// require initialization or cleanup. +/// +/// @{ + +/// \brief Class wrapping a unique struct ArrowSchema +using UniqueSchema = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowArray +using UniqueArray = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowArrayStream +using UniqueArrayStream = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowBuffer +using UniqueBuffer = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowBitmap +using UniqueBitmap = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowArrayView +using UniqueArrayView = internal::Unique; + +/// @} + +/// \defgroup nanoarrow_hpp-buffer Buffer helpers +/// +/// Helpers to wrap buffer-like C++ objects as ArrowBuffer objects that can +/// be used to build ArrowArray objects. +/// +/// @{ + +/// \brief Initialize a buffer wrapping an arbitrary C++ object +/// +/// Initializes a buffer with a release callback that deletes the moved obj +/// when ArrowBufferReset is called. This version is useful for wrapping +/// an object whose .data() member is missing or unrelated to the buffer +/// value that is destined for a the buffer of an ArrowArray. T must be movable. +template +static inline void BufferInitWrapped(struct ArrowBuffer* buffer, T obj, + const uint8_t* data, int64_t size_bytes) { + T* obj_moved = new T(std::move(obj)); + buffer->data = const_cast(data); + buffer->size_bytes = size_bytes; + buffer->capacity_bytes = 0; + buffer->allocator = + ArrowBufferDeallocator(&internal::DeallocateWrappedBuffer, obj_moved); +} + +/// \brief Initialize a buffer wrapping a C++ sequence +/// +/// Specifically, this uses obj.data() to set the buffer address and +/// obj.size() * sizeof(T::value_type) to set the buffer size. This works +/// for STL containers like std::vector, std::array, and std::string. +/// This function moves obj and ensures it is deleted when ArrowBufferReset +/// is called. +template +void BufferInitSequence(struct ArrowBuffer* buffer, T obj) { + // Move before calling .data() (matters sometimes). + T* obj_moved = new T(std::move(obj)); + buffer->data = + const_cast(reinterpret_cast(obj_moved->data())); + buffer->size_bytes = obj_moved->size() * sizeof(typename T::value_type); + buffer->capacity_bytes = 0; + buffer->allocator = + ArrowBufferDeallocator(&internal::DeallocateWrappedBuffer, obj_moved); +} + +/// @} + +/// \defgroup nanoarrow_hpp-array-stream ArrayStream helpers +/// +/// These classes provide simple ArrowArrayStream implementations that +/// can be extended to help simplify the process of creating a valid +/// ArrowArrayStream implementation or used as-is for testing. +/// +/// @{ + +/// @brief Export an ArrowArrayStream from a standard C++ class +/// @tparam T A class with methods `int GetSchema(ArrowSchema*)`, `int +/// GetNext(ArrowArray*)`, and `const char* GetLastError()` +/// +/// This class allows a standard C++ class to be exported to a generic ArrowArrayStream +/// consumer by mapping C callback invocations to method calls on an instance of the +/// object whose lifecycle is owned by the ArrowArrayStream. See VectorArrayStream for +/// minimal useful example of this pattern. +/// +/// The methods must be accessible to the ArrayStreamFactory, either as public methods or +/// by declaring ArrayStreamFactory a friend. Implementors are encouraged (but +/// not required) to implement a ToArrayStream(ArrowArrayStream*) that creates a new +/// instance owned by the ArrowArrayStream and moves the relevant data to that instance. +/// +/// An example implementation might be: +/// +/// \code +/// class StreamImpl { +/// public: +/// // Public methods (e.g., constructor) used from C++ to initialize relevant data +/// +/// // Idiomatic exporter to move data + lifecycle responsibility to an instance +/// // managed by the ArrowArrayStream callbacks +/// void ToArrayStream(struct ArrowArrayStream* out) { +/// ArrayStreamFactory::InitArrayStream(new StreamImpl(...), out); +/// } +/// +/// private: +/// // Make relevant methods available to the ArrayStreamFactory +/// friend class ArrayStreamFactory; +/// +/// // Method implementations (called from C, not normally interacted with from C++) +/// int GetSchema(struct ArrowSchema* schema) { return ENOTSUP; } +/// int GetNext(struct ArrowArray* array) { return ENOTSUP; } +/// const char* GetLastError() { nullptr; } +/// }; +/// \endcode +/// +/// An example usage might be: +/// +/// \code +/// // Call constructor and/or public methods to initialize relevant data +/// StreamImpl impl; +/// +/// // Export to ArrowArrayStream after data are finalized +/// UniqueArrayStream stream; +/// impl.ToArrayStream(stream.get()); +/// \endcode +template +class ArrayStreamFactory { + public: + /// \brief Take ownership of instance and populate callbacks of out + static void InitArrayStream(T* instance, struct ArrowArrayStream* out) { + out->get_schema = &get_schema_wrapper; + out->get_next = &get_next_wrapper; + out->get_last_error = &get_last_error_wrapper; + out->release = &release_wrapper; + out->private_data = instance; + } + + private: + static int get_schema_wrapper(struct ArrowArrayStream* stream, + struct ArrowSchema* schema) { + return reinterpret_cast(stream->private_data)->GetSchema(schema); + } + + static int get_next_wrapper(struct ArrowArrayStream* stream, struct ArrowArray* array) { + return reinterpret_cast(stream->private_data)->GetNext(array); + } + + static const char* get_last_error_wrapper(struct ArrowArrayStream* stream) { + return reinterpret_cast(stream->private_data)->GetLastError(); + } + + static void release_wrapper(struct ArrowArrayStream* stream) { + delete reinterpret_cast(stream->private_data); + stream->release = nullptr; + stream->private_data = nullptr; + } +}; + +/// \brief An empty array stream +/// +/// This class can be constructed from an struct ArrowSchema and implements a default +/// get_next() method that always marks the output ArrowArray as released. +/// +/// DEPRECATED (0.4.0): Early versions of nanoarrow allowed subclasses to override +/// get_schema(), get_next(), and get_last_error(). This functionality will be removed +/// in a future release: use the pattern documented in ArrayStreamFactory to create +/// custom ArrowArrayStream implementations. +class EmptyArrayStream { + public: + /// \brief Create an EmptyArrayStream from an ArrowSchema + /// + /// Takes ownership of schema. + EmptyArrayStream(struct ArrowSchema* schema) : schema_(schema) { + ArrowErrorInit(&error_); + } + + /// \brief Export to ArrowArrayStream + void ToArrayStream(struct ArrowArrayStream* out) { + EmptyArrayStream* impl = new EmptyArrayStream(schema_.get()); + ArrayStreamFactory::InitArrayStream(impl, out); + } + + /// \brief Create an empty UniqueArrayStream from a struct ArrowSchema + /// + /// DEPRECATED (0.4.0): Use the constructor + ToArrayStream() to export an + /// EmptyArrayStream to an ArrowArrayStream consumer. + static UniqueArrayStream MakeUnique(struct ArrowSchema* schema) { + UniqueArrayStream stream; + EmptyArrayStream(schema).ToArrayStream(stream.get()); + return stream; + } + + virtual ~EmptyArrayStream() {} + + protected: + UniqueSchema schema_; + struct ArrowError error_; + + void MakeStream(struct ArrowArrayStream* stream) { ToArrayStream(stream); } + + virtual int get_schema(struct ArrowSchema* schema) { + return ArrowSchemaDeepCopy(schema_.get(), schema); + } + + virtual int get_next(struct ArrowArray* array) { + array->release = nullptr; + return NANOARROW_OK; + } + + virtual const char* get_last_error() { return error_.message; } + + private: + friend class ArrayStreamFactory; + + int GetSchema(struct ArrowSchema* schema) { return get_schema(schema); } + + int GetNext(struct ArrowArray* array) { return get_next(array); } + + const char* GetLastError() { return get_last_error(); } +}; + +/// \brief Implementation of an ArrowArrayStream backed by a vector of UniqueArray objects +class VectorArrayStream { + public: + /// \brief Create a VectorArrayStream from an ArrowSchema + vector of UniqueArray + /// + /// Takes ownership of schema and moves arrays if possible. + VectorArrayStream(struct ArrowSchema* schema, std::vector arrays) + : offset_(0), schema_(schema), arrays_(std::move(arrays)) {} + + /// \brief Create a one-shot VectorArrayStream from an ArrowSchema + ArrowArray + /// + /// Takes ownership of schema and array. + VectorArrayStream(struct ArrowSchema* schema, struct ArrowArray* array) + : offset_(0), schema_(schema) { + arrays_.emplace_back(array); + } + + /// \brief Export to ArrowArrayStream + void ToArrayStream(struct ArrowArrayStream* out) { + VectorArrayStream* impl = new VectorArrayStream(schema_.get(), std::move(arrays_)); + ArrayStreamFactory::InitArrayStream(impl, out); + } + + /// \brief Create a UniqueArrowArrayStream from an existing array + /// + /// DEPRECATED (0.4.0): Use the constructors + ToArrayStream() to export a + /// VectorArrayStream to an ArrowArrayStream consumer. + static UniqueArrayStream MakeUnique(struct ArrowSchema* schema, + struct ArrowArray* array) { + UniqueArrayStream stream; + VectorArrayStream(schema, array).ToArrayStream(stream.get()); + return stream; + } + + /// \brief Create a UniqueArrowArrayStream from existing arrays + /// + /// DEPRECATED (0.4.0): Use the constructor + ToArrayStream() to export a + /// VectorArrayStream to an ArrowArrayStream consumer. + static UniqueArrayStream MakeUnique(struct ArrowSchema* schema, + std::vector arrays) { + UniqueArrayStream stream; + VectorArrayStream(schema, std::move(arrays)).ToArrayStream(stream.get()); + return stream; + } + + private: + int64_t offset_; + UniqueSchema schema_; + std::vector arrays_; + + friend class ArrayStreamFactory; + + int GetSchema(struct ArrowSchema* schema) { + return ArrowSchemaDeepCopy(schema_.get(), schema); + } + + int GetNext(struct ArrowArray* array) { + if (offset_ < static_cast(arrays_.size())) { + arrays_[offset_++].move(array); + } else { + array->release = nullptr; + } + + return NANOARROW_OK; + } + + const char* GetLastError() { return ""; } +}; + +/// @} + +} // namespace nanoarrow + +#endif From ea0adee89ce3a9b9ffc4d8ed5a529e9b7719848b Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Thu, 7 Mar 2024 10:01:25 -0600 Subject: [PATCH 05/39] Ensure nullable is set correctly in either case --- libtiledbsoma/src/utils/arrow_adapter.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 90e677b059..4fea01aea5 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -372,7 +372,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { } if (column->is_nullable()) { - schema->flags |= ARROW_FLAG_NULLABLE; + schema->flags |= ARROW_FLAG_NULLABLE; // turns out it is also set by default // Count nulls for (auto v : column->validity()) { @@ -382,7 +382,10 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { // Convert validity bytemap to a bitmap in place column->validity_to_bitmap(); array->buffers[0] = column->validity().data(); + } else { + schema->flags = 0; // because ArrowSchemaInitFromType leads to NULLABLE set } + if (column->is_ordered()) { schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; } From 7cbbb9d31a338f616f5fc1fafa5060542bd12704 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Thu, 7 Mar 2024 12:02:12 -0600 Subject: [PATCH 06/39] Context wrapped in a special purpose struct should not finalize --- apis/r/src/riterator.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index b4bfdcc0df..d803a6c040 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -3,6 +3,8 @@ #define TILEDB_NO_API_DEPRECATION_WARNINGS #endif +//#define RCPP_DEBUG_LEVEL 5 + #include // for R interface to C++ #include // for C interface to Arrow #include @@ -99,7 +101,7 @@ Rcpp::List sr_setup(const std::string& uri, std::shared_ptr ctxptr = std::make_shared(cfg); ctx_wrap_t* ctxwrap_p = new ContextWrapper(ctxptr); - Rcpp::XPtr ctx_wrap_xptr = make_xptr(ctxwrap_p); + Rcpp::XPtr ctx_wrap_xptr = make_xptr(ctxwrap_p, false); if (!colnames.isNull()) { column_names = Rcpp::as>(colnames); From b86ff8172e32eab4adaae7b440d37aeb50b5f6ff Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Thu, 7 Mar 2024 13:18:09 -0600 Subject: [PATCH 07/39] Simpler and faster r-ci.yaml --- .github/workflows/r-ci.yml | 12 ++++++------ libtiledbsoma/src/utils/arrow_adapter.cc | 14 +++----------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/.github/workflows/r-ci.yml b/.github/workflows/r-ci.yml index a1cd3beb50..88c26eb811 100644 --- a/.github/workflows/r-ci.yml +++ b/.github/workflows/r-ci.yml @@ -79,13 +79,13 @@ jobs: # if: ${{ matrix.os != 'macOS-latest' }} # run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))" - - name: Install r-universe build of SeuratObject (macOS) - if: ${{ matrix.os == 'macOS-latest' }} - run: cd apis/r && Rscript -e "install.packages('SeuratObject', repos = c('https://mojaveazure.r-universe.dev', 'https://cloud.r-project.org'))" + #- name: Install r-universe build of SeuratObject (macOS) + # if: ${{ matrix.os == 'macOS-latest' }} + # run: cd apis/r && Rscript -e "install.packages('SeuratObject', repos = c('https://mojaveazure.r-universe.dev', 'https://cloud.r-project.org'))" - - name: Install r-universe build of SeuratObject (linux) - if: ${{ matrix.os == 'ubuntu-latest' }} - run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('SeuratObject', repos = c('https://mojaveazure.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))" + #- name: Install r-universe build of SeuratObject (linux) + # if: ${{ matrix.os == 'ubuntu-latest' }} + # run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('SeuratObject', repos = c('https://mojaveazure.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))" - name: Dependencies run: cd apis/r && tools/r-ci.sh install_all diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 4fea01aea5..58a5622ad0 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -83,18 +83,11 @@ void ArrowAdapter::release_schema(struct ArrowSchema* schema) { dict = nullptr; } } - LOG_TRACE("[ArrowAdapter] release_schema"); + LOG_TRACE("[ArrowAdapter] release_schema done"); } void ArrowAdapter::release_array(struct ArrowArray* array) { auto arrow_buffer = static_cast(array->private_data); - LOG_DEBUG(fmt::format("[ArrowAdapter] release_array for {} cnt {} var {} nullable {} enum {}", - arrow_buffer->buffer_->name(), - arrow_buffer->buffer_.use_count(), - arrow_buffer->buffer_->is_var(), - arrow_buffer->buffer_->is_nullable(), - arrow_buffer->buffer_->has_enumeration() - )); LOG_TRACE(fmt::format( "[ArrowAdapter] release_array {} use_count={}", @@ -127,17 +120,16 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { struct ArrowArray* dict = array->dictionary; if (dict != nullptr) { if (dict->buffers != nullptr) { - free(dict->buffers); + //free(dict->buffers); dict->buffers = nullptr; } if (dict->release != nullptr) { - //delete dict; free(dict); dict = nullptr; } } array->release = nullptr; - + LOG_TRACE(fmt::format("[ArrowAdapter] release_array done")); } std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( From 9bc9f05db56487e5ac4852db9daf5ed36249e710 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Thu, 7 Mar 2024 15:00:19 -0600 Subject: [PATCH 08/39] Use nanoarrow 0.4.0 consistently --- apis/r/src/nanoarrow.c | 262 +++++++++++++- apis/r/src/nanoarrow.h | 42 +++ apis/r/src/nanoarrow.hpp | 501 ++++++++++++++++++++++++++ apis/r/src/rinterface.cpp | 2 +- libtiledbsoma/src/utils/nanoarrow.c | 46 +-- libtiledbsoma/src/utils/nanoarrow.h | 28 +- libtiledbsoma/src/utils/nanoarrow.hpp | 52 --- 7 files changed, 822 insertions(+), 111 deletions(-) create mode 100644 apis/r/src/nanoarrow.hpp diff --git a/apis/r/src/nanoarrow.c b/apis/r/src/nanoarrow.c index d9a8d7d905..c946c01362 100644 --- a/apis/r/src/nanoarrow.c +++ b/apis/r/src/nanoarrow.c @@ -231,6 +231,205 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( allocator.private_data = private_data; return allocator; } + +static const int kInt32DecimalDigits = 9; + +static const uint64_t kUInt32PowersOfTen[] = { + 1ULL, 10ULL, 100ULL, 1000ULL, 10000ULL, + 100000ULL, 1000000ULL, 10000000ULL, 100000000ULL, 1000000000ULL}; + +// Adapted from Arrow C++ to use 32-bit words for better C portability +// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L524-L544 +static void ShiftAndAdd(struct ArrowStringView value, uint32_t* out, int64_t out_size) { + // We use strtoll for parsing, which needs input that is null-terminated + char chunk_string[16]; + + for (int64_t posn = 0; posn < value.size_bytes;) { + int64_t remaining = value.size_bytes - posn; + + int64_t group_size; + if (remaining > kInt32DecimalDigits) { + group_size = kInt32DecimalDigits; + } else { + group_size = remaining; + } + + const uint64_t multiple = kUInt32PowersOfTen[group_size]; + + memcpy(chunk_string, value.data + posn, group_size); + chunk_string[group_size] = '\0'; + uint32_t chunk = (uint32_t)strtoll(chunk_string, NULL, 10); + + for (int64_t i = 0; i < out_size; i++) { + uint64_t tmp = out[i]; + tmp *= multiple; + tmp += chunk; + out[i] = (uint32_t)(tmp & 0xFFFFFFFFULL); + chunk = (uint32_t)(tmp >> 32); + } + posn += group_size; + } +} + +ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, + struct ArrowStringView value) { + // Check for sign + int is_negative = value.data[0] == '-'; + int has_sign = is_negative || value.data[0] == '+'; + value.data += has_sign; + value.size_bytes -= has_sign; + + // Check all characters are digits that are not the negative sign + for (int64_t i = 0; i < value.size_bytes; i++) { + char c = value.data[i]; + if (c < '0' || c > '9') { + return EINVAL; + } + } + + // Skip over leading 0s + int64_t n_leading_zeroes = 0; + for (int64_t i = 0; i < value.size_bytes; i++) { + if (value.data[i] == '0') { + n_leading_zeroes++; + } else { + break; + } + } + + value.data += n_leading_zeroes; + value.size_bytes -= n_leading_zeroes; + + // Use 32-bit words for portability + uint32_t words32[8]; + int n_words32 = decimal->n_words * 2; + NANOARROW_DCHECK(n_words32 <= 8); + memset(words32, 0, sizeof(words32)); + + ShiftAndAdd(value, words32, n_words32); + + if (decimal->low_word_index == 0) { + memcpy(decimal->words, words32, sizeof(uint32_t) * n_words32); + } else { + uint64_t lo; + uint64_t hi; + + for (int i = 0; i < decimal->n_words; i++) { + lo = (uint64_t)words32[i * 2]; + hi = (uint64_t)words32[i * 2 + 1] << 32; + decimal->words[decimal->n_words - i - 1] = lo | hi; + } + } + + if (is_negative) { + ArrowDecimalNegate(decimal); + } + + return NANOARROW_OK; +} + +// Adapted from Arrow C++ for C +// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L365 +ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, + struct ArrowBuffer* buffer) { + int is_negative = ArrowDecimalSign(decimal) < 0; + + uint64_t words_little_endian[4]; + if (decimal->low_word_index == 0) { + memcpy(words_little_endian, decimal->words, decimal->n_words * sizeof(uint64_t)); + } else { + for (int i = 0; i < decimal->n_words; i++) { + words_little_endian[i] = decimal->words[decimal->n_words - i - 1]; + } + } + + // We've already made a copy, so negate that if needed + if (is_negative) { + uint64_t carry = 1; + for (int i = 0; i < decimal->n_words; i++) { + uint64_t elem = words_little_endian[i]; + elem = ~elem + carry; + carry &= (elem == 0); + words_little_endian[i] = elem; + } + } + + // Find the most significant word that is non-zero + int most_significant_elem_idx = -1; + for (int i = decimal->n_words - 1; i >= 0; i--) { + if (words_little_endian[i] != 0) { + most_significant_elem_idx = i; + break; + } + } + + // If they are all zero, the output is just '0' + if (most_significant_elem_idx == -1) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(buffer, '0')); + return NANOARROW_OK; + } + + // Define segments such that each segment represents 9 digits with the + // least significant group of 9 digits first. For example, if the input represents + // 9876543210123456789, then segments will be [123456789, 876543210, 9]. + // We handle at most a signed 256 bit integer, whose maximum value occupies 77 + // characters. Thus, we need at most 9 segments. + const uint32_t k1e9 = 1000000000U; + int num_segments = 0; + uint32_t segments[9]; + memset(segments, 0, sizeof(segments)); + uint64_t* most_significant_elem = words_little_endian + most_significant_elem_idx; + + do { + // Compute remainder = words_little_endian % 1e9 and words_little_endian = + // words_little_endian / 1e9. + uint32_t remainder = 0; + uint64_t* elem = most_significant_elem; + + do { + // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer); + // *elem = dividend / 1e9; + // remainder = dividend % 1e9. + uint32_t hi = (uint32_t)(*elem >> 32); + uint32_t lo = (uint32_t)(*elem & 0xFFFFFFFFULL); + uint64_t dividend_hi = ((uint64_t)(remainder) << 32) | hi; + uint64_t quotient_hi = dividend_hi / k1e9; + remainder = (uint32_t)(dividend_hi % k1e9); + uint64_t dividend_lo = ((uint64_t)(remainder) << 32) | lo; + uint64_t quotient_lo = dividend_lo / k1e9; + remainder = (uint32_t)(dividend_lo % k1e9); + + *elem = (quotient_hi << 32) | quotient_lo; + } while (elem-- != words_little_endian); + + segments[num_segments++] = remainder; + } while (*most_significant_elem != 0 || most_significant_elem-- != words_little_endian); + + // We know our output has no more than 9 digits per segment, plus a negative sign, + // plus any further digits between our output of 9 digits plus enough + // extra characters to ensure that snprintf() with n = 21 (maximum length of %lu + // including a the null terminator) is bounded properly. + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, num_segments * 9 + 1 + 21 - 9)); + if (is_negative) { + buffer->data[buffer->size_bytes++] = '-'; + } + + // The most significant segment should have no leading zeroes + int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu", + (unsigned long)segments[num_segments - 1]); + buffer->size_bytes += n_chars; + + // Subsequent output needs to be left-padded with zeroes such that each segment + // takes up exactly 9 digits. + for (int i = num_segments - 2; i >= 0; i--) { + int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%09lu", + (unsigned long)segments[i]); + buffer->size_bytes += n_chars; + NANOARROW_DCHECK(buffer->size_bytes <= buffer->capacity_bytes); + } + + return NANOARROW_OK; +} // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -255,7 +454,8 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( #include "nanoarrow.h" -static void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { +// -- changed for tiledb-r static +void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { if (schema->format != NULL) ArrowFree((void*)schema->format); if (schema->name != NULL) ArrowFree((void*)schema->name); if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); @@ -296,8 +496,7 @@ static void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { schema->release = NULL; } -// -- changed for tiledb-r static -const char* ArrowSchemaFormatTemplate(enum ArrowType type) { +static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_UNINITIALIZED: return NULL; @@ -364,8 +563,7 @@ const char* ArrowSchemaFormatTemplate(enum ArrowType type) { } } -// -- changed for tiledb-r static -int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, +static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, enum ArrowType type) { switch (type) { case NANOARROW_TYPE_LIST: @@ -531,10 +729,33 @@ ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum Arrow int n_chars; switch (type) { case NANOARROW_TYPE_TIME32: + if (timezone != NULL) { + return EINVAL; + } + + switch (time_unit) { + case NANOARROW_TIME_UNIT_MICRO: + case NANOARROW_TIME_UNIT_NANO: + return EINVAL; + default: + break; + } + + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); + break; case NANOARROW_TYPE_TIME64: if (timezone != NULL) { return EINVAL; } + + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + case NANOARROW_TIME_UNIT_MILLI: + return EINVAL; + default: + break; + } + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); break; case NANOARROW_TYPE_TIMESTAMP: @@ -1390,16 +1611,31 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, schema_view->type = NANOARROW_TYPE_DICTIONARY; } - result = ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error); - if (result != NANOARROW_OK) { - return result; - } + NANOARROW_RETURN_NOT_OK( + ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error)); if (schema_view->storage_type != schema_view->type) { - result = ArrowSchemaViewValidate(schema_view, schema_view->type, error); - if (result != NANOARROW_OK) { - return result; - } + NANOARROW_RETURN_NOT_OK( + ArrowSchemaViewValidate(schema_view, schema_view->type, error)); + } + + int64_t unknown_flags = schema->flags & ~NANOARROW_FLAG_ALL_SUPPORTED; + if (unknown_flags != 0) { + ArrowErrorSet(error, "Unknown ArrowSchema flag"); + return EINVAL; + } + + if (schema->flags & ARROW_FLAG_DICTIONARY_ORDERED && + schema_view->type != NANOARROW_TYPE_DICTIONARY) { + ArrowErrorSet(error, + "ARROW_FLAG_DICTIONARY_ORDERED is only relevant for dictionaries"); + return EINVAL; + } + + if (schema->flags & ARROW_FLAG_MAP_KEYS_SORTED && + schema_view->type != NANOARROW_TYPE_MAP) { + ArrowErrorSet(error, "ARROW_FLAG_MAP_KEYS_SORTED is only relevant for a map type"); + return EINVAL; } ArrowLayoutInit(&schema_view->layout, schema_view->storage_type); diff --git a/apis/r/src/nanoarrow.h b/apis/r/src/nanoarrow.h index 331da29837..e338560f1a 100644 --- a/apis/r/src/nanoarrow.h +++ b/apis/r/src/nanoarrow.h @@ -241,6 +241,11 @@ typedef int ArrowErrorCode; #define ArrowErrorCode NANOARROW_CHECK_RETURN_ATTRIBUTE ArrowErrorCode #endif +/// \brief Flags supported by ArrowSchemaViewInit() +/// \ingroup nanoarrow-schema-view +#define NANOARROW_FLAG_ALL_SUPPORTED \ + (ARROW_FLAG_DICTIONARY_ORDERED | ARROW_FLAG_NULLABLE | ARROW_FLAG_MAP_KEYS_SORTED) + /// \brief Error type containing a UTF-8 encoded message. /// \ingroup nanoarrow-errors struct ArrowError { @@ -948,6 +953,28 @@ static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t valu decimal->words[decimal->low_word_index] = value; } +/// \brief Negate the value of this decimal in place +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalNegate(struct ArrowDecimal* decimal) { + uint64_t carry = 1; + + if (decimal->low_word_index == 0) { + for (int i = 0; i < decimal->n_words; i++) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } + } else { + for (int i = decimal->low_word_index; i >= 0; i--) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } + } +} + /// \brief Copy bytes from a buffer into this decimal /// \ingroup nanoarrow-utils static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, @@ -1009,6 +1036,9 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) #define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet) #define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit) +#define ArrowDecimalSetDigits NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalSetDigits) +#define ArrowDecimalAppendDigitsToBuffer \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalAppendDigitsToBuffer) #define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit) #define ArrowSchemaInitFromType \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) @@ -1242,6 +1272,14 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type); /// \brief Create a string view from a null-terminated string static inline struct ArrowStringView ArrowCharView(const char* value); +/// \brief Sets the integer value of an ArrowDecimal from a string +ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, + struct ArrowStringView value); + +/// \brief Get the integer value of an ArrowDecimal as string +ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, + struct ArrowBuffer* buffer); + /// @} /// \defgroup nanoarrow-schema Creating schemas @@ -3280,6 +3318,10 @@ static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, return EINVAL; } + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + array->length++; return NANOARROW_OK; } diff --git a/apis/r/src/nanoarrow.hpp b/apis/r/src/nanoarrow.hpp new file mode 100644 index 0000000000..8d5b841e28 --- /dev/null +++ b/apis/r/src/nanoarrow.hpp @@ -0,0 +1,501 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "nanoarrow.h" + +#ifndef NANOARROW_HPP_INCLUDED +#define NANOARROW_HPP_INCLUDED + +/// \defgroup nanoarrow_hpp Nanoarrow C++ Helpers +/// +/// The utilities provided in this file are intended to support C++ users +/// of the nanoarrow C library such that C++-style resource allocation +/// and error handling can be used with nanoarrow data structures. +/// These utilities are not intended to mirror the nanoarrow C API. + +namespace nanoarrow { + +/// \defgroup nanoarrow_hpp-errors Error handling helpers +/// +/// Most functions in the C API return an ArrowErrorCode to communicate +/// possible failure. Except where documented, it is usually not safe to +/// continue after a non-zero value has been returned. While the +/// nanoarrow C++ helpers do not throw any exceptions of their own, +/// these helpers are provided to facilitate using the nanoarrow C++ helpers +/// in frameworks where this is a useful error handling idiom. +/// +/// @{ + +class Exception : public std::exception { + public: + Exception(const std::string& msg) : msg_(msg) {} + const char* what() const noexcept { return msg_.c_str(); } + + private: + std::string msg_; +}; + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_THROW_NOT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + throw nanoarrow::Exception( \ + std::string(EXPR_STR) + std::string(" failed with errno ") + \ + std::to_string(NAME) + std::string("\n * ") + std::string(__FILE__) + \ + std::string(":") + std::to_string(__LINE__) + std::string("\n")); \ + } \ + } while (0) +#else +#define _NANOARROW_THROW_NOT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + throw nanoarrow::Exception(std::string(EXPR_STR) + \ + std::string(" failed with errno ") + \ + std::to_string(NAME)); \ + } \ + } while (0) +#endif + +#define NANOARROW_THROW_NOT_OK(EXPR) \ + _NANOARROW_THROW_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, \ + #EXPR) + +/// @} + +namespace internal { + +/// \defgroup nanoarrow_hpp-unique_base Base classes for Unique wrappers +/// +/// @{ + +template +static inline void init_pointer(T* data); + +template +static inline void move_pointer(T* src, T* dst); + +template +static inline void release_pointer(T* data); + +template <> +inline void init_pointer(struct ArrowSchema* data) { + data->release = nullptr; +} + +template <> +inline void move_pointer(struct ArrowSchema* src, struct ArrowSchema* dst) { + ArrowSchemaMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowSchema* data) { + if (data->release != nullptr) { + data->release(data); + } +} + +template <> +inline void init_pointer(struct ArrowArray* data) { + data->release = nullptr; +} + +template <> +inline void move_pointer(struct ArrowArray* src, struct ArrowArray* dst) { + ArrowArrayMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowArray* data) { + if (data->release != nullptr) { + data->release(data); + } +} + +template <> +inline void init_pointer(struct ArrowArrayStream* data) { + data->release = nullptr; +} + +template <> +inline void move_pointer(struct ArrowArrayStream* src, struct ArrowArrayStream* dst) { + ArrowArrayStreamMove(src, dst); +} + +template <> +inline void release_pointer(ArrowArrayStream* data) { + if (data->release != nullptr) { + data->release(data); + } +} + +template <> +inline void init_pointer(struct ArrowBuffer* data) { + ArrowBufferInit(data); +} + +template <> +inline void move_pointer(struct ArrowBuffer* src, struct ArrowBuffer* dst) { + ArrowBufferMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowBuffer* data) { + ArrowBufferReset(data); +} + +template <> +inline void init_pointer(struct ArrowBitmap* data) { + ArrowBitmapInit(data); +} + +template <> +inline void move_pointer(struct ArrowBitmap* src, struct ArrowBitmap* dst) { + ArrowBitmapMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowBitmap* data) { + ArrowBitmapReset(data); +} + +template <> +inline void init_pointer(struct ArrowArrayView* data) { + ArrowArrayViewInitFromType(data, NANOARROW_TYPE_UNINITIALIZED); +} + +template <> +inline void move_pointer(struct ArrowArrayView* src, struct ArrowArrayView* dst) { + ArrowArrayViewMove(src, dst); +} + +template <> +inline void release_pointer(struct ArrowArrayView* data) { + ArrowArrayViewReset(data); +} + +/// \brief A unique_ptr-like base class for stack-allocatable objects +/// \tparam T The object type +template +class Unique { + public: + /// \brief Construct an invalid instance of T holding no resources + Unique() { init_pointer(&data_); } + + /// \brief Move and take ownership of data + Unique(T* data) { move_pointer(data, &data_); } + + /// \brief Move and take ownership of data wrapped by rhs + Unique(Unique&& rhs) : Unique(rhs.get()) {} + Unique& operator=(Unique&& rhs) { + reset(rhs.get()); + return *this; + } + + // These objects are not copyable + Unique(const Unique& rhs) = delete; + + /// \brief Get a pointer to the data owned by this object + T* get() noexcept { return &data_; } + const T* get() const noexcept { return &data_; } + + /// \brief Use the pointer operator to access fields of this object + T* operator->() noexcept { return &data_; } + const T* operator->() const noexcept { return &data_; } + + /// \brief Call data's release callback if valid + void reset() { release_pointer(&data_); } + + /// \brief Call data's release callback if valid and move ownership of the data + /// pointed to by data + void reset(T* data) { + reset(); + move_pointer(data, &data_); + } + + /// \brief Move ownership of this object to the data pointed to by out + void move(T* out) { move_pointer(&data_, out); } + + ~Unique() { reset(); } + + protected: + T data_; +}; + +/// @} + +} // namespace internal + +/// \defgroup nanoarrow_hpp-unique Unique object wrappers +/// +/// The Arrow C Data interface, the Arrow C Stream interface, and the +/// nanoarrow C library use stack-allocatable objects, some of which +/// require initialization or cleanup. +/// +/// @{ + +/// \brief Class wrapping a unique struct ArrowSchema +using UniqueSchema = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowArray +using UniqueArray = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowArrayStream +using UniqueArrayStream = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowBuffer +using UniqueBuffer = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowBitmap +using UniqueBitmap = internal::Unique; + +/// \brief Class wrapping a unique struct ArrowArrayView +using UniqueArrayView = internal::Unique; + +/// @} + +/// \defgroup nanoarrow_hpp-array-stream ArrayStream helpers +/// +/// These classes provide simple ArrowArrayStream implementations that +/// can be extended to help simplify the process of creating a valid +/// ArrowArrayStream implementation or used as-is for testing. +/// +/// @{ + +/// @brief Export an ArrowArrayStream from a standard C++ class +/// @tparam T A class with methods `int GetSchema(ArrowSchema*)`, `int +/// GetNext(ArrowArray*)`, and `const char* GetLastError()` +/// +/// This class allows a standard C++ class to be exported to a generic ArrowArrayStream +/// consumer by mapping C callback invocations to method calls on an instance of the +/// object whose lifecycle is owned by the ArrowArrayStream. See VectorArrayStream for +/// minimal useful example of this pattern. +/// +/// The methods must be accessible to the ArrayStreamFactory, either as public methods or +/// by declaring ArrayStreamFactory a friend. Implementors are encouraged (but +/// not required) to implement a ToArrayStream(ArrowArrayStream*) that creates a new +/// instance owned by the ArrowArrayStream and moves the relevant data to that instance. +/// +/// An example implementation might be: +/// +/// \code +/// class StreamImpl { +/// public: +/// // Public methods (e.g., constructor) used from C++ to initialize relevant data +/// +/// // Idiomatic exporter to move data + lifecycle responsibility to an instance +/// // managed by the ArrowArrayStream callbacks +/// void ToArrayStream(struct ArrowArrayStream* out) { +/// ArrayStreamFactory::InitArrayStream(new StreamImpl(...), out); +/// } +/// +/// private: +/// // Make relevant methods available to the ArrayStreamFactory +/// friend class ArrayStreamFactory; +/// +/// // Method implementations (called from C, not normally interacted with from C++) +/// int GetSchema(struct ArrowSchema* schema) { return ENOTSUP; } +/// int GetNext(struct ArrowArray* array) { return ENOTSUP; } +/// const char* GetLastError() { nullptr; } +/// }; +/// \endcode +/// +/// An example usage might be: +/// +/// \code +/// // Call constructor and/or public methods to initialize relevant data +/// StreamImpl impl; +/// +/// // Export to ArrowArrayStream after data are finalized +/// UniqueArrayStream stream; +/// impl.ToArrayStream(stream.get()); +/// \endcode +template +class ArrayStreamFactory { + public: + /// \brief Take ownership of instance and populate callbacks of out + static void InitArrayStream(T* instance, struct ArrowArrayStream* out) { + out->get_schema = &get_schema_wrapper; + out->get_next = &get_next_wrapper; + out->get_last_error = &get_last_error_wrapper; + out->release = &release_wrapper; + out->private_data = instance; + } + + private: + static int get_schema_wrapper(struct ArrowArrayStream* stream, + struct ArrowSchema* schema) { + return reinterpret_cast(stream->private_data)->GetSchema(schema); + } + + static int get_next_wrapper(struct ArrowArrayStream* stream, struct ArrowArray* array) { + return reinterpret_cast(stream->private_data)->GetNext(array); + } + + static const char* get_last_error_wrapper(struct ArrowArrayStream* stream) { + return reinterpret_cast(stream->private_data)->GetLastError(); + } + + static void release_wrapper(struct ArrowArrayStream* stream) { + delete reinterpret_cast(stream->private_data); + stream->release = nullptr; + stream->private_data = nullptr; + } +}; + +/// \brief An empty array stream +/// +/// This class can be constructed from an struct ArrowSchema and implements a default +/// get_next() method that always marks the output ArrowArray as released. +/// +/// DEPRECATED (0.4.0): Early versions of nanoarrow allowed subclasses to override +/// get_schema(), get_next(), and get_last_error(). This functionality will be removed +/// in a future release: use the pattern documented in ArrayStreamFactory to create +/// custom ArrowArrayStream implementations. +class EmptyArrayStream { + public: + /// \brief Create an EmptyArrayStream from an ArrowSchema + /// + /// Takes ownership of schema. + EmptyArrayStream(struct ArrowSchema* schema) : schema_(schema) { + ArrowErrorInit(&error_); + } + + /// \brief Export to ArrowArrayStream + void ToArrayStream(struct ArrowArrayStream* out) { + EmptyArrayStream* impl = new EmptyArrayStream(schema_.get()); + ArrayStreamFactory::InitArrayStream(impl, out); + } + + /// \brief Create an empty UniqueArrayStream from a struct ArrowSchema + /// + /// DEPRECATED (0.4.0): Use the constructor + ToArrayStream() to export an + /// EmptyArrayStream to an ArrowArrayStream consumer. + static UniqueArrayStream MakeUnique(struct ArrowSchema* schema) { + UniqueArrayStream stream; + EmptyArrayStream(schema).ToArrayStream(stream.get()); + return stream; + } + + virtual ~EmptyArrayStream() {} + + protected: + UniqueSchema schema_; + struct ArrowError error_; + + void MakeStream(struct ArrowArrayStream* stream) { ToArrayStream(stream); } + + virtual int get_schema(struct ArrowSchema* schema) { + return ArrowSchemaDeepCopy(schema_.get(), schema); + } + + virtual int get_next(struct ArrowArray* array) { + array->release = nullptr; + return NANOARROW_OK; + } + + virtual const char* get_last_error() { return error_.message; } + + private: + friend class ArrayStreamFactory; + + int GetSchema(struct ArrowSchema* schema) { return get_schema(schema); } + + int GetNext(struct ArrowArray* array) { return get_next(array); } + + const char* GetLastError() { return get_last_error(); } +}; + +/// \brief Implementation of an ArrowArrayStream backed by a vector of UniqueArray objects +class VectorArrayStream { + public: + /// \brief Create a VectorArrayStream from an ArrowSchema + vector of UniqueArray + /// + /// Takes ownership of schema and moves arrays if possible. + VectorArrayStream(struct ArrowSchema* schema, std::vector arrays) + : offset_(0), schema_(schema), arrays_(std::move(arrays)) {} + + /// \brief Create a one-shot VectorArrayStream from an ArrowSchema + ArrowArray + /// + /// Takes ownership of schema and array. + VectorArrayStream(struct ArrowSchema* schema, struct ArrowArray* array) + : offset_(0), schema_(schema) { + arrays_.emplace_back(array); + } + + /// \brief Export to ArrowArrayStream + void ToArrayStream(struct ArrowArrayStream* out) { + VectorArrayStream* impl = new VectorArrayStream(schema_.get(), std::move(arrays_)); + ArrayStreamFactory::InitArrayStream(impl, out); + } + + /// \brief Create a UniqueArrowArrayStream from an existing array + /// + /// DEPRECATED (0.4.0): Use the constructors + ToArrayStream() to export a + /// VectorArrayStream to an ArrowArrayStream consumer. + static UniqueArrayStream MakeUnique(struct ArrowSchema* schema, + struct ArrowArray* array) { + UniqueArrayStream stream; + VectorArrayStream(schema, array).ToArrayStream(stream.get()); + return stream; + } + + /// \brief Create a UniqueArrowArrayStream from existing arrays + /// + /// DEPRECATED (0.4.0): Use the constructor + ToArrayStream() to export a + /// VectorArrayStream to an ArrowArrayStream consumer. + static UniqueArrayStream MakeUnique(struct ArrowSchema* schema, + std::vector arrays) { + UniqueArrayStream stream; + VectorArrayStream(schema, std::move(arrays)).ToArrayStream(stream.get()); + return stream; + } + + private: + int64_t offset_; + UniqueSchema schema_; + std::vector arrays_; + + friend class ArrayStreamFactory; + + int GetSchema(struct ArrowSchema* schema) { + return ArrowSchemaDeepCopy(schema_.get(), schema); + } + + int GetNext(struct ArrowArray* array) { + if (offset_ < static_cast(arrays_.size())) { + arrays_[offset_++].move(array); + } else { + array->release = nullptr; + } + + return NANOARROW_OK; + } + + const char* GetLastError() { return ""; } +}; + +/// @} + +} // namespace nanoarrow + +#endif diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index 666e31a06b..e8a8f949a9 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -1,6 +1,6 @@ #include // for R interface to C++ #include // for C interface to Arrow -#include // for C interface to Arrow +#include // for C/C++ interface to Arrow #include // for fromInteger64 // we currently get deprecation warnings by default which are noisy diff --git a/libtiledbsoma/src/utils/nanoarrow.c b/libtiledbsoma/src/utils/nanoarrow.c index d7925587f5..c946c01362 100644 --- a/libtiledbsoma/src/utils/nanoarrow.c +++ b/libtiledbsoma/src/utils/nanoarrow.c @@ -201,9 +201,7 @@ static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocato uint8_t* ptr, int64_t size) { NANOARROW_UNUSED(allocator); NANOARROW_UNUSED(size); - if (ptr != NULL) { - ArrowFree(ptr); - } + ArrowFree(ptr); } static struct ArrowBufferAllocator ArrowBufferAllocatorMalloc = { @@ -213,24 +211,13 @@ struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { return ArrowBufferAllocatorMalloc; } -static uint8_t* ArrowBufferDeallocatorReallocate(struct ArrowBufferAllocator* allocator, - uint8_t* ptr, int64_t old_size, - int64_t new_size) { +static uint8_t* ArrowBufferAllocatorNeverReallocate( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, + int64_t new_size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(ptr); + NANOARROW_UNUSED(old_size); NANOARROW_UNUSED(new_size); - - // Attempting to reallocate a buffer with a custom deallocator is - // a programming error. In debug mode, crash here. -#if defined(NANOARROW_DEBUG) - NANOARROW_PRINT_AND_DIE(ENOMEM, - "It is an error to reallocate a buffer whose allocator is " - "ArrowBufferDeallocator()"); -#endif - - // In release mode, ensure the the deallocator is called exactly - // once using the pointer it was given and return NULL, which - // will trigger the caller to return ENOMEM. - allocator->free(allocator, ptr, old_size); - *allocator = ArrowBufferAllocatorDefault(); return NULL; } @@ -239,7 +226,7 @@ struct ArrowBufferAllocator ArrowBufferDeallocator( int64_t size), void* private_data) { struct ArrowBufferAllocator allocator; - allocator.reallocate = &ArrowBufferDeallocatorReallocate; + allocator.reallocate = &ArrowBufferAllocatorNeverReallocate; allocator.free = custom_free; allocator.private_data = private_data; return allocator; @@ -467,7 +454,8 @@ ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decim #include "nanoarrow.h" -static void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { +// -- changed for tiledb-r static +void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { if (schema->format != NULL) ArrowFree((void*)schema->format); if (schema->name != NULL) ArrowFree((void*)schema->name); if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); @@ -2037,7 +2025,8 @@ ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, #include "nanoarrow.h" -static void ArrowArrayReleaseInternal(struct ArrowArray* array) { +// -- changed for tiledb-r static +void ArrowArrayReleaseInternal(struct ArrowArray* array) { // Release buffers held by this array struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; @@ -2080,7 +2069,8 @@ static void ArrowArrayReleaseInternal(struct ArrowArray* array) { array->release = NULL; } -static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, +// -- changed for tiledb-r static +ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, enum ArrowType storage_type) { switch (storage_type) { case NANOARROW_TYPE_UNINITIALIZED: @@ -2919,10 +2909,6 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, (long)array_view->buffer_views[2].size_bytes); return EINVAL; } - } else if (array_view->buffer_views[2].size_bytes == -1) { - // If the data buffer size is unknown and there are no bytes in the offset buffer, - // set the data buffer size to 0. - array_view->buffer_views[2].size_bytes = 0; } break; @@ -2949,10 +2935,6 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, (long)array_view->buffer_views[2].size_bytes); return EINVAL; } - } else if (array_view->buffer_views[2].size_bytes == -1) { - // If the data buffer size is unknown and there are no bytes in the offset - // buffer, set the data buffer size to 0. - array_view->buffer_views[2].size_bytes = 0; } break; diff --git a/libtiledbsoma/src/utils/nanoarrow.h b/libtiledbsoma/src/utils/nanoarrow.h index 8d62ac64fd..e338560f1a 100644 --- a/libtiledbsoma/src/utils/nanoarrow.h +++ b/libtiledbsoma/src/utils/nanoarrow.h @@ -19,9 +19,9 @@ #define NANOARROW_BUILD_ID_H_INCLUDED #define NANOARROW_VERSION_MAJOR 0 -#define NANOARROW_VERSION_MINOR 5 +#define NANOARROW_VERSION_MINOR 4 #define NANOARROW_VERSION_PATCH 0 -#define NANOARROW_VERSION "0.5.0-SNAPSHOT" +#define NANOARROW_VERSION "0.4.0-SNAPSHOT" #define NANOARROW_VERSION_INT \ (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ @@ -721,9 +721,6 @@ struct ArrowBufferAllocator { void* private_data; }; -typedef void (*ArrowBufferDeallocatorCallback)(struct ArrowBufferAllocator* allocator, - uint8_t* ptr, int64_t size); - /// \brief An owning mutable view of a buffer /// \ingroup nanoarrow-buffer struct ArrowBuffer { @@ -1171,8 +1168,10 @@ struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void); /// attach a custom deallocator to an ArrowBuffer. This may be used to /// avoid copying an existing buffer that was not allocated using the /// infrastructure provided here (e.g., by an R or Python object). -struct ArrowBufferAllocator ArrowBufferDeallocator(ArrowBufferDeallocatorCallback, - void* private_data); +struct ArrowBufferAllocator ArrowBufferDeallocator( + void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t size), + void* private_data); /// @} @@ -2196,8 +2195,6 @@ static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { static inline ArrowErrorCode ArrowBufferSetAllocator( struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) { - // This is not a perfect test for "has a buffer already been allocated" - // but is likely to catch most cases. if (buffer->data == NULL) { buffer->allocator = allocator; return NANOARROW_OK; @@ -2207,15 +2204,20 @@ static inline ArrowErrorCode ArrowBufferSetAllocator( } static inline void ArrowBufferReset(struct ArrowBuffer* buffer) { - buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, - buffer->capacity_bytes); - ArrowBufferInit(buffer); + if (buffer->data != NULL) { + buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, + buffer->capacity_bytes); + buffer->data = NULL; + } + + buffer->capacity_bytes = 0; + buffer->size_bytes = 0; } static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) { memcpy(dst, src, sizeof(struct ArrowBuffer)); src->data = NULL; - ArrowBufferInit(src); + ArrowBufferReset(src); } static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, diff --git a/libtiledbsoma/src/utils/nanoarrow.hpp b/libtiledbsoma/src/utils/nanoarrow.hpp index 09a031511b..8d5b841e28 100644 --- a/libtiledbsoma/src/utils/nanoarrow.hpp +++ b/libtiledbsoma/src/utils/nanoarrow.hpp @@ -241,13 +241,6 @@ class Unique { T data_; }; -template -static inline void DeallocateWrappedBuffer(struct ArrowBufferAllocator* allocator, - uint8_t* ptr, int64_t size) { - auto obj = reinterpret_cast(allocator->private_data); - delete obj; -} - /// @} } // namespace internal @@ -280,51 +273,6 @@ using UniqueArrayView = internal::Unique; /// @} -/// \defgroup nanoarrow_hpp-buffer Buffer helpers -/// -/// Helpers to wrap buffer-like C++ objects as ArrowBuffer objects that can -/// be used to build ArrowArray objects. -/// -/// @{ - -/// \brief Initialize a buffer wrapping an arbitrary C++ object -/// -/// Initializes a buffer with a release callback that deletes the moved obj -/// when ArrowBufferReset is called. This version is useful for wrapping -/// an object whose .data() member is missing or unrelated to the buffer -/// value that is destined for a the buffer of an ArrowArray. T must be movable. -template -static inline void BufferInitWrapped(struct ArrowBuffer* buffer, T obj, - const uint8_t* data, int64_t size_bytes) { - T* obj_moved = new T(std::move(obj)); - buffer->data = const_cast(data); - buffer->size_bytes = size_bytes; - buffer->capacity_bytes = 0; - buffer->allocator = - ArrowBufferDeallocator(&internal::DeallocateWrappedBuffer, obj_moved); -} - -/// \brief Initialize a buffer wrapping a C++ sequence -/// -/// Specifically, this uses obj.data() to set the buffer address and -/// obj.size() * sizeof(T::value_type) to set the buffer size. This works -/// for STL containers like std::vector, std::array, and std::string. -/// This function moves obj and ensures it is deleted when ArrowBufferReset -/// is called. -template -void BufferInitSequence(struct ArrowBuffer* buffer, T obj) { - // Move before calling .data() (matters sometimes). - T* obj_moved = new T(std::move(obj)); - buffer->data = - const_cast(reinterpret_cast(obj_moved->data())); - buffer->size_bytes = obj_moved->size() * sizeof(typename T::value_type); - buffer->capacity_bytes = 0; - buffer->allocator = - ArrowBufferDeallocator(&internal::DeallocateWrappedBuffer, obj_moved); -} - -/// @} - /// \defgroup nanoarrow_hpp-array-stream ArrayStream helpers /// /// These classes provide simple ArrowArrayStream implementations that From 4f5dbe182299a60aeb06b5e1d0825ae16a9cbfd7 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 8 Mar 2024 10:29:32 -0600 Subject: [PATCH 09/39] Refined arrow_adapter --- apis/r/tests/testthat/test-SCEOutgest.R | 2 + libtiledbsoma/src/utils/arrow_adapter.cc | 98 ++++++++++++------------ 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/apis/r/tests/testthat/test-SCEOutgest.R b/apis/r/tests/testthat/test-SCEOutgest.R index 24c44b880d..eb9658e5de 100644 --- a/apis/r/tests/testthat/test-SCEOutgest.R +++ b/apis/r/tests/testthat/test-SCEOutgest.R @@ -1,4 +1,5 @@ test_that("Load SCE object from ExperimentQuery mechanics", { + if (Sys.getenv("GITHUB_ACTION") != "") set_log_level("trace") skip_if(!extended_tests() || covr_tests()) skip_if_not_installed('SingleCellExperiment', .MINIMUM_SCE_VERSION('c')) uri <- withr::local_tempdir("sce-experiment-query-whole") @@ -358,4 +359,5 @@ test_that("Load SCE object from indexed ExperimentQuery", { ) expect_identical(SingleCellExperiment::colPairNames(obj), 'connectivities') expect_identical(SingleCellExperiment::rowPairNames(obj), 'network') + if (Sys.getenv("GITHUB_ACTION") != "") set_log_level("warn") }) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 58a5622ad0..59d9f32b3a 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -40,55 +40,55 @@ using namespace tiledb; void ArrowAdapter::release_schema(struct ArrowSchema* schema) { LOG_DEBUG(fmt::format("[ArrowAdapter] release_schema for {}", schema->name)); - schema->release = nullptr; if (schema->name != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->name"); free((void*)schema->name); schema->name = nullptr; } if (schema->format != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->format"); free((void*)schema->format); schema->format = nullptr; } - for (int i = 0; i < schema->n_children; ++i) { - struct ArrowSchema* child = schema->children[i]; - if (child->name != nullptr) { - free((void*)child->name); - child->name = nullptr; - } - if (child->format != nullptr) { - free((void*)child->format); - child->format = nullptr; - } - if (child->release != NULL) { - child->release(child); - } - free(child); + if (schema->metadata != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->metadata"); + free((void*)schema->metadata); + schema->metadata = nullptr; } - free(schema->children); - struct ArrowSchema* dict = schema->dictionary; - if (dict != nullptr) { - if (dict->name != nullptr) { - free((void*)dict->name); - dict->name = nullptr; - } - if (dict->format != nullptr) { - free((void*)dict->format); - dict->format = nullptr; + if (schema->children != nullptr) { + for (auto i = 0; i < schema->n_children; i++) { + if (schema->children[i] != nullptr) { + if (schema->children[i]->release != nullptr) { + LOG_TRACE(fmt::format("[ArrowAdapter] release_schema schema->child {} release",i)); + release_schema(schema->children[i]); + } + LOG_TRACE(fmt::format("[ArrowAdapter] release_schema schema->child {} free",i)); + free(schema->children[i]); + } } - if (dict->release != nullptr) { - //delete dict; - free(dict); - dict = nullptr; + LOG_TRACE("[ArrowAdapter] release_schema schema->children"); + free(schema->children); + schema->children = nullptr; + } + + if (schema->dictionary != nullptr) { + if (schema->dictionary->release != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->dict release"); + release_schema(schema->dictionary); } + LOG_TRACE("[ArrowAdapter] release_schema schema->dict free"); + free(schema->dictionary); + schema->dictionary = nullptr; } + + schema->release = nullptr; LOG_TRACE("[ArrowAdapter] release_schema done"); } void ArrowAdapter::release_array(struct ArrowArray* array) { auto arrow_buffer = static_cast(array->private_data); - LOG_TRACE(fmt::format( "[ArrowAdapter] release_array {} use_count={}", arrow_buffer->buffer_->name(), @@ -104,30 +104,34 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { array->buffers = nullptr; } - if (array->n_children > 0) { - for (int i = 0; i < array->n_children; ++i) { - struct ArrowArray* child = array->children[i]; - if (child != nullptr) { - release_array(child); - free(child); - child = nullptr; + if (array->children != nullptr) { + for (auto i = 0; i < array->n_children; i++) { + if (array->children[i] != nullptr) { + if (array->children[i]->release != nullptr) { + LOG_TRACE(fmt::format("[ArrowAdapter] release_schema array->child {} release",i)); + release_array(array->children[i]); + } + LOG_TRACE(fmt::format("[ArrowAdapter] release_schema array->child {} free",i)); + free(array->children[i]); } } + LOG_TRACE("[ArrowAdapter] release_array array->children"); free(array->children); array->children = nullptr; } - struct ArrowArray* dict = array->dictionary; - if (dict != nullptr) { - if (dict->buffers != nullptr) { - //free(dict->buffers); - dict->buffers = nullptr; - } - if (dict->release != nullptr) { - free(dict); - dict = nullptr; - } + if (array->dictionary != nullptr) { + // -- TODO: This can lead to segfault on some data sets and could be cause + // by how we fill arrow data structures. This should pass. + //if (array->dictionary->release != nullptr) { + // LOG_TRACE("[ArrowAdapter] release_array array->dict release"); + // release_array(array->dictionary); + //} + LOG_TRACE("[ArrowAdapter] release_array array->dict free"); + free(array->dictionary); + array->dictionary = nullptr; } + array->release = nullptr; LOG_TRACE(fmt::format("[ArrowAdapter] release_array done")); } From ffb8cd332c1e73cf8f7d032887325194531c63f7 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 8 Mar 2024 10:38:47 -0600 Subject: [PATCH 10/39] Set increased timeout for download.file to survive GH flakyness --- apis/r/tools/r-ci.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apis/r/tools/r-ci.sh b/apis/r/tools/r-ci.sh index 866ce4031a..b8e9ad4ca2 100755 --- a/apis/r/tools/r-ci.sh +++ b/apis/r/tools/r-ci.sh @@ -354,12 +354,12 @@ InstallDeps() { } InstallDepsAndSuggests() { - sudo Rscript -e 'remotes::install_deps(".", dependencies=TRUE)' + sudo Rscript -e 'options(timeout = max(300, getOption("timeout"))); remotes::install_deps(".", dependencies=TRUE)' } DumpSysinfo() { echo "Dumping system information." - R -e '.libPaths(); sessionInfo(); installed.packages()' + Rscript -e '.libPaths(); sessionInfo(); installed.packages()' } DumpLogsByExtension() { From 4454f821ce3bdb7e32a108a847378d54c416d642 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 8 Mar 2024 11:57:54 -0600 Subject: [PATCH 11/39] Turn trace back of, do not include carrow in cli --- apis/r/tests/testthat/test-SCEOutgest.R | 4 ++-- libtiledbsoma/src/cli/cli.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apis/r/tests/testthat/test-SCEOutgest.R b/apis/r/tests/testthat/test-SCEOutgest.R index eb9658e5de..4dbd6838d3 100644 --- a/apis/r/tests/testthat/test-SCEOutgest.R +++ b/apis/r/tests/testthat/test-SCEOutgest.R @@ -1,5 +1,5 @@ test_that("Load SCE object from ExperimentQuery mechanics", { - if (Sys.getenv("GITHUB_ACTION") != "") set_log_level("trace") + #if (Sys.getenv("GITHUB_ACTION") != "") set_log_level("trace") skip_if(!extended_tests() || covr_tests()) skip_if_not_installed('SingleCellExperiment', .MINIMUM_SCE_VERSION('c')) uri <- withr::local_tempdir("sce-experiment-query-whole") @@ -359,5 +359,5 @@ test_that("Load SCE object from indexed ExperimentQuery", { ) expect_identical(SingleCellExperiment::colPairNames(obj), 'connectivities') expect_identical(SingleCellExperiment::rowPairNames(obj), 'network') - if (Sys.getenv("GITHUB_ACTION") != "") set_log_level("warn") + #if (Sys.getenv("GITHUB_ACTION") != "") set_log_level("warn") }) diff --git a/libtiledbsoma/src/cli/cli.cc b/libtiledbsoma/src/cli/cli.cc index b0eabc788c..20300508cd 100644 --- a/libtiledbsoma/src/cli/cli.cc +++ b/libtiledbsoma/src/cli/cli.cc @@ -33,7 +33,7 @@ #include "soma/enums.h" #include "soma/soma_array.h" #include "utils/arrow_adapter.h" -#include "utils/carrow.h" +//#include "utils/carrow.h" #include "utils/logger.h" using namespace tiledbsoma; From 1986c458c28f69112e7c9f6d5f4c12cf60ace9fa Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 8 Mar 2024 15:00:53 -0600 Subject: [PATCH 12/39] Do not include carrow.h in reindexer.cc --- apis/python/src/tiledbsoma/reindexer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apis/python/src/tiledbsoma/reindexer.cc b/apis/python/src/tiledbsoma/reindexer.cc index 025325a73e..7004c18b97 100644 --- a/apis/python/src/tiledbsoma/reindexer.cc +++ b/apis/python/src/tiledbsoma/reindexer.cc @@ -31,7 +31,7 @@ */ #include -#include +//#include #include "common.h" #define DENUM(x) .value(#x, TILEDB_##x) From 12678331cebfe9f135489ced31e11534c699de47 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Mon, 18 Mar 2024 07:02:17 -0500 Subject: [PATCH 13/39] WIP changes expanding type map, suppressing schema release --- libtiledbsoma/src/utils/arrow_adapter.cc | 33 +++++++++++++----------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 59d9f32b3a..6aeeadf792 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -39,7 +39,7 @@ namespace tiledbsoma { using namespace tiledb; void ArrowAdapter::release_schema(struct ArrowSchema* schema) { - LOG_DEBUG(fmt::format("[ArrowAdapter] release_schema for {}", schema->name)); + LOG_DEBUG("[ArrowAdapter] release_schema"); if (schema->name != nullptr) { LOG_TRACE("[ArrowAdapter] release_schema schema->name"); @@ -48,7 +48,7 @@ void ArrowAdapter::release_schema(struct ArrowSchema* schema) { } if (schema->format != nullptr) { LOG_TRACE("[ArrowAdapter] release_schema schema->format"); - free((void*)schema->format); + //free((void*)schema->format); schema->format = nullptr; } if (schema->metadata != nullptr) { @@ -520,19 +520,22 @@ std::string_view ArrowAdapter::to_arrow_format( // FIXME: Add more types, maybe make it a map enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { - if (sv == "i") return NANOARROW_TYPE_INT32; - else if (sv == "c") return NANOARROW_TYPE_INT8; - else if (sv == "C") return NANOARROW_TYPE_UINT8; - else if (sv == "s") return NANOARROW_TYPE_INT16; - else if (sv == "S") return NANOARROW_TYPE_UINT16; - else if (sv == "I") return NANOARROW_TYPE_UINT32; - else if (sv == "l") return NANOARROW_TYPE_INT64; - else if (sv == "L") return NANOARROW_TYPE_UINT64; - else if (sv == "f") return NANOARROW_TYPE_FLOAT; - else if (sv == "g") return NANOARROW_TYPE_DOUBLE; - else if (sv == "u") return NANOARROW_TYPE_STRING; - else if (sv == "U") return NANOARROW_TYPE_LARGE_STRING; - else if (sv == "b") return NANOARROW_TYPE_BOOL; + if (sv == "i") return NANOARROW_TYPE_INT32; + else if (sv == "c") return NANOARROW_TYPE_INT8; + else if (sv == "C") return NANOARROW_TYPE_UINT8; + else if (sv == "s") return NANOARROW_TYPE_INT16; + else if (sv == "S") return NANOARROW_TYPE_UINT16; + else if (sv == "I") return NANOARROW_TYPE_UINT32; + else if (sv == "l") return NANOARROW_TYPE_INT64; + else if (sv == "L") return NANOARROW_TYPE_UINT64; + else if (sv == "f") return NANOARROW_TYPE_FLOAT; + else if (sv == "g") return NANOARROW_TYPE_DOUBLE; + else if (sv == "u") return NANOARROW_TYPE_STRING; + else if (sv == "U") return NANOARROW_TYPE_LARGE_STRING; + else if (sv == "b") return NANOARROW_TYPE_BOOL; + else if (sv == "tss:") return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "z") return NANOARROW_TYPE_BINARY; + else if (sv == "Z") return NANOARROW_TYPE_LARGE_BINARY; else throw TileDBSOMAError(fmt::format( "ArrowAdapter: Unsupported TileDB datatype string: {} ", sv)); } From 3a4add731da10a04a9063b62eabad51f2727dccc Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 15 Mar 2024 16:15:51 -0500 Subject: [PATCH 14/39] [c++] Fix segfault issues --- apis/python/src/tiledbsoma/reindexer.cc | 2 +- libtiledbsoma/src/cli/cli.cc | 2 +- libtiledbsoma/src/utils/arrow_adapter.cc | 159 +- libtiledbsoma/src/utils/arrow_adapter.h | 6 +- libtiledbsoma/src/utils/nanoarrow.h | 4590 +++++++++++----------- 5 files changed, 2504 insertions(+), 2255 deletions(-) diff --git a/apis/python/src/tiledbsoma/reindexer.cc b/apis/python/src/tiledbsoma/reindexer.cc index 7004c18b97..bbfa035658 100644 --- a/apis/python/src/tiledbsoma/reindexer.cc +++ b/apis/python/src/tiledbsoma/reindexer.cc @@ -31,7 +31,7 @@ */ #include -//#include +// #include #include "common.h" #define DENUM(x) .value(#x, TILEDB_##x) diff --git a/libtiledbsoma/src/cli/cli.cc b/libtiledbsoma/src/cli/cli.cc index 20300508cd..cd69c8096f 100644 --- a/libtiledbsoma/src/cli/cli.cc +++ b/libtiledbsoma/src/cli/cli.cc @@ -33,7 +33,7 @@ #include "soma/enums.h" #include "soma/soma_array.h" #include "utils/arrow_adapter.h" -//#include "utils/carrow.h" +// #include "utils/carrow.h" #include "utils/logger.h" using namespace tiledbsoma; diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 6aeeadf792..b0b44b4649 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -40,6 +40,9 @@ using namespace tiledb; void ArrowAdapter::release_schema(struct ArrowSchema* schema) { LOG_DEBUG("[ArrowAdapter] release_schema"); + if (schema->name != nullptr) + LOG_DEBUG( + fmt::format("[ArrowAdapter] release_schema for {}", schema->name)); if (schema->name != nullptr) { LOG_TRACE("[ArrowAdapter] release_schema schema->name"); @@ -61,10 +64,14 @@ void ArrowAdapter::release_schema(struct ArrowSchema* schema) { for (auto i = 0; i < schema->n_children; i++) { if (schema->children[i] != nullptr) { if (schema->children[i]->release != nullptr) { - LOG_TRACE(fmt::format("[ArrowAdapter] release_schema schema->child {} release",i)); + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema schema->child {} " + "release", + i)); release_schema(schema->children[i]); } - LOG_TRACE(fmt::format("[ArrowAdapter] release_schema schema->child {} free",i)); + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema schema->child {} free", i)); free(schema->children[i]); } } @@ -108,10 +115,13 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { for (auto i = 0; i < array->n_children; i++) { if (array->children[i] != nullptr) { if (array->children[i]->release != nullptr) { - LOG_TRACE(fmt::format("[ArrowAdapter] release_schema array->child {} release",i)); + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema array->child {} release", + i)); release_array(array->children[i]); } - LOG_TRACE(fmt::format("[ArrowAdapter] release_schema array->child {} free",i)); + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema array->child {} free", i)); free(array->children[i]); } } @@ -121,9 +131,10 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { } if (array->dictionary != nullptr) { - // -- TODO: This can lead to segfault on some data sets and could be cause + // -- TODO: This can lead to segfault on some data sets and could be + // cause // by how we fill arrow data structures. This should pass. - //if (array->dictionary->release != nullptr) { + // if (array->dictionary->release != nullptr) { // LOG_TRACE("[ArrowAdapter] release_array array->dict release"); // release_array(array->dictionary); //} @@ -143,17 +154,21 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( auto nattr = tiledb_schema.attribute_num(); std::unique_ptr arrow_schema = std::make_unique(); - arrow_schema->format = "+s"; + arrow_schema->format = strdup("+s"); arrow_schema->n_children = ndim + nattr; arrow_schema->release = &ArrowAdapter::release_schema; - arrow_schema->children = (ArrowSchema**) malloc(arrow_schema->n_children * sizeof(ArrowSchema*)); //new ArrowSchema*[arrow_schema->n_children]; + arrow_schema->children = (ArrowSchema**)malloc( + arrow_schema->n_children * + sizeof(ArrowSchema*)); // new ArrowSchema*[arrow_schema->n_children]; ArrowSchema* child = nullptr; for (uint32_t i = 0; i < ndim; ++i) { auto dim = tiledb_schema.domain().dimension(i); - child = arrow_schema->children[i] = (ArrowSchema*) malloc(sizeof(ArrowSchema)); //new ArrowSchema; - child->format = ArrowAdapter::to_arrow_format(dim.type()).data(); + child = arrow_schema->children[i] = (ArrowSchema*)malloc( + sizeof(ArrowSchema)); // new ArrowSchema; + child->format = strdup( + ArrowAdapter::to_arrow_format(dim.type()).data()); child->name = strdup(dim.name().c_str()); child->metadata = nullptr; child->flags = 0; @@ -165,8 +180,10 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( for (uint32_t i = 0; i < nattr; ++i) { auto attr = tiledb_schema.attribute(i); - child = arrow_schema->children[ndim + i] = (ArrowSchema*) malloc(sizeof(ArrowSchema)); //new ArrowSchema; - child->format = ArrowAdapter::to_arrow_format(attr.type()).data(); + child = arrow_schema->children[ndim + i] = (ArrowSchema*)malloc( + sizeof(ArrowSchema)); // new ArrowSchema; + child->format = strdup( + ArrowAdapter::to_arrow_format(attr.type()).data()); child->name = strdup(attr.name().c_str()); child->metadata = nullptr; child->flags = attr.nullable() ? ARROW_FLAG_NULLABLE : 0; @@ -179,7 +196,9 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( if (enmr_name.has_value()) { auto enmr = ArrayExperimental::get_enumeration( *ctx, *tiledb_array, attr.name()); - auto dict = new ArrowSchema; + + auto dict = (ArrowSchema*)malloc( + sizeof(ArrowSchema)); // new ArrowSchema; if (enmr.type() == TILEDB_STRING_ASCII or enmr.type() == TILEDB_CHAR) { dict->format = strdup("z"); @@ -219,7 +238,7 @@ std::pair ArrowAdapter::_get_data_and_length( // Allocate a single byte to copy the bits into size_t sz = 1; - dst = malloc(sz); //new const void*[sz]; + dst = malloc(sz); // new const void*[sz]; std::memcpy((void*)dst, &src, sz); return std::pair(dst, data.size()); @@ -290,8 +309,8 @@ bool ArrowAdapter::_isstr(const char* format) { inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { if (ec != NANOARROW_OK) - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Arrow Error {} ", msg)); + throw TileDBSOMAError( + fmt::format("ArrowAdapter: Arrow Error {} ", msg)); } std::pair, std::unique_ptr> @@ -304,8 +323,10 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { auto coltype = to_arrow_format(column->type()).data(); auto natype = to_nanoarrow_type(coltype); exitIfError(ArrowSchemaInitFromType(sch, natype), "Bad schema init"); - exitIfError(ArrowSchemaSetName(sch, column->name().data()), "Bad schema name"); - exitIfError(ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); + exitIfError( + ArrowSchemaSetName(sch, column->name().data()), "Bad schema name"); + exitIfError( + ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); #if 0 schema->format = to_arrow_format(column->type()).data(); @@ -319,7 +340,9 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { schema->release = &release_schema; schema->private_data = nullptr; - int n_buffers = column->is_var() ? 3 : 2; // this will be 2 for enumerations and 3 for char vectors + int n_buffers = column->is_var() ? 3 : + 2; // this will be 2 for enumerations + // and 3 for char vectors // Create an ArrowBuffer to manage the lifetime of `column`. // - `arrow_buffer` holds a shared_ptr to `column`, which @@ -337,10 +360,13 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { exitIfError(ArrowArrayAllocateChildren(arr, 0), "Bad array children alloc"); array->length = column->size(); - LOG_DEBUG(fmt::format("[ArrowAdapter] column type {} name {} nbuf {} {} nullable {}", - to_arrow_format(column->type()).data(), - column->name().data(), n_buffers, array->n_buffers, column->is_nullable())); - + LOG_DEBUG(fmt::format( + "[ArrowAdapter] column type {} name {} nbuf {} {} nullable {}", + to_arrow_format(column->type()).data(), + column->name().data(), + n_buffers, + array->n_buffers, + column->is_nullable())); #if 0 array->null_count = 0; @@ -359,16 +385,18 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { column->name(), column.use_count())); - array->buffers = (const void**) malloc(sizeof(void*) * n_buffers); //new const void*[n_buffers]; + array->buffers = (const void**)malloc( + sizeof(void*) * n_buffers); // new const void*[n_buffers]; assert(array->buffers != nullptr); - array->buffers[0] = nullptr; // validity addressed below + array->buffers[0] = nullptr; // validity addressed below array->buffers[n_buffers - 1] = column->data().data(); // data if (n_buffers == 3) { array->buffers[1] = column->offsets().data(); // offsets } if (column->is_nullable()) { - schema->flags |= ARROW_FLAG_NULLABLE; // turns out it is also set by default + schema->flags |= ARROW_FLAG_NULLABLE; // turns out it is also set by + // default // Count nulls for (auto v : column->validity()) { @@ -379,7 +407,8 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { column->validity_to_bitmap(); array->buffers[0] = column->validity().data(); } else { - schema->flags = 0; // because ArrowSchemaInitFromType leads to NULLABLE set + schema->flags = 0; // because ArrowSchemaInitFromType leads to NULLABLE + // set } if (column->is_ordered()) { @@ -392,16 +421,21 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { } if (column->has_enumeration()) { - auto dict_sch = (ArrowSchema*) malloc(sizeof(ArrowSchema)); //new ArrowSchema; - auto dict_arr = (ArrowArray*) malloc(sizeof(ArrowArray)); //new ArrowArray; + auto dict_sch = (ArrowSchema*)malloc( + sizeof(ArrowSchema)); // new ArrowSchema; + auto dict_arr = (ArrowArray*)malloc( + sizeof(ArrowArray)); // new ArrowArray; auto enmr = column->get_enumeration_info(); auto dcoltype = to_arrow_format(enmr->type(), false).data(); auto dnatype = to_nanoarrow_type(dcoltype); - exitIfError(ArrowSchemaInitFromType(dict_sch, dnatype), "Bad schema init"); + exitIfError( + ArrowSchemaInitFromType(dict_sch, dnatype), "Bad schema init"); exitIfError(ArrowSchemaSetName(dict_sch, ""), "Bad schema name"); - exitIfError(ArrowSchemaAllocateChildren(dict_sch, 0), "Bad schema children alloc"); + exitIfError( + ArrowSchemaAllocateChildren(dict_sch, 0), + "Bad schema children alloc"); #if 0 dict_sch->format = strdup(to_arrow_format(enmr->type(), false).data()); dict_sch->name = nullptr; @@ -414,11 +448,11 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { dict_sch->private_data = nullptr; #endif - exitIfError(ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); - exitIfError(ArrowArrayAllocateChildren(dict_arr, 0), "Bad array children alloc"); - const int n_buf = ArrowAdapter::_isstr(dict_sch->format) ? 3 : 2; - dict_arr->buffers = (const void**) malloc(sizeof(void*) * n_buf); //new const void*[n_buf]; - dict_arr->buffers[0] = nullptr; // validity: none here + exitIfError( + ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); + exitIfError( + ArrowArrayAllocateChildren(dict_arr, 0), + "Bad array children alloc"); dict_arr->release = &release_array; #if 0 dict_arr->null_count = 0; @@ -520,24 +554,41 @@ std::string_view ArrowAdapter::to_arrow_format( // FIXME: Add more types, maybe make it a map enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { - if (sv == "i") return NANOARROW_TYPE_INT32; - else if (sv == "c") return NANOARROW_TYPE_INT8; - else if (sv == "C") return NANOARROW_TYPE_UINT8; - else if (sv == "s") return NANOARROW_TYPE_INT16; - else if (sv == "S") return NANOARROW_TYPE_UINT16; - else if (sv == "I") return NANOARROW_TYPE_UINT32; - else if (sv == "l") return NANOARROW_TYPE_INT64; - else if (sv == "L") return NANOARROW_TYPE_UINT64; - else if (sv == "f") return NANOARROW_TYPE_FLOAT; - else if (sv == "g") return NANOARROW_TYPE_DOUBLE; - else if (sv == "u") return NANOARROW_TYPE_STRING; - else if (sv == "U") return NANOARROW_TYPE_LARGE_STRING; - else if (sv == "b") return NANOARROW_TYPE_BOOL; - else if (sv == "tss:") return NANOARROW_TYPE_INT64; // NB time resolution set indepedently - else if (sv == "z") return NANOARROW_TYPE_BINARY; - else if (sv == "Z") return NANOARROW_TYPE_LARGE_BINARY; - else throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype string: {} ", sv)); + if (sv == "i") + return NANOARROW_TYPE_INT32; + else if (sv == "c") + return NANOARROW_TYPE_INT8; + else if (sv == "C") + return NANOARROW_TYPE_UINT8; + else if (sv == "s") + return NANOARROW_TYPE_INT16; + else if (sv == "S") + return NANOARROW_TYPE_UINT16; + else if (sv == "I") + return NANOARROW_TYPE_UINT32; + else if (sv == "l") + return NANOARROW_TYPE_INT64; + else if (sv == "L") + return NANOARROW_TYPE_UINT64; + else if (sv == "f") + return NANOARROW_TYPE_FLOAT; + else if (sv == "g") + return NANOARROW_TYPE_DOUBLE; + else if (sv == "u") + return NANOARROW_TYPE_STRING; + else if (sv == "U") + return NANOARROW_TYPE_LARGE_STRING; + else if (sv == "b") + return NANOARROW_TYPE_BOOL; + else if (sv == "tss:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "z") + return NANOARROW_TYPE_BINARY; + else if (sv == "Z") + return NANOARROW_TYPE_LARGE_BINARY; + else + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB datatype string: {} ", sv)); } } // namespace tiledbsoma diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index a84a37506c..1d8ca8f6d4 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -9,9 +9,9 @@ // https://arrow.apache.org/docs/format/CDataInterface.html#exporting-a-simple-int32-array #include "nanoarrow.hpp" -//#ifndef ARROW_SCHEMA_AND_ARRAY_DEFINED -//#include "carrow.h" -//#endif +// #ifndef ARROW_SCHEMA_AND_ARRAY_DEFINED +// #include "carrow.h" +// #endif namespace tiledbsoma { diff --git a/libtiledbsoma/src/utils/nanoarrow.h b/libtiledbsoma/src/utils/nanoarrow.h index e338560f1a..91c1e90708 100644 --- a/libtiledbsoma/src/utils/nanoarrow.h +++ b/libtiledbsoma/src/utils/nanoarrow.h @@ -23,9 +23,9 @@ #define NANOARROW_VERSION_PATCH 0 #define NANOARROW_VERSION "0.4.0-SNAPSHOT" -#define NANOARROW_VERSION_INT \ - (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ - NANOARROW_VERSION_PATCH) +#define NANOARROW_VERSION_INT \ + (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ + NANOARROW_VERSION_PATCH) // #define NANOARROW_NAMESPACE YourNamespaceHere @@ -53,8 +53,6 @@ #include #include - - #if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) #include #include @@ -70,11 +68,11 @@ extern "C" { /// \defgroup nanoarrow-arrow-cdata Arrow C Data interface /// /// The Arrow C Data (https://arrow.apache.org/docs/format/CDataInterface.html) -/// and Arrow C Stream (https://arrow.apache.org/docs/format/CStreamInterface.html) -/// interfaces are part of the -/// Arrow Columnar Format specification -/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for -/// documentation of these structures. +/// and Arrow C Stream +/// (https://arrow.apache.org/docs/format/CStreamInterface.html) interfaces are +/// part of the Arrow Columnar Format specification +/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow +/// documentation for documentation of these structures. /// /// @{ @@ -86,36 +84,36 @@ extern "C" { #define ARROW_FLAG_MAP_KEYS_SORTED 4 struct ArrowSchema { - // Array type description - const char* format; - const char* name; - const char* metadata; - int64_t flags; - int64_t n_children; - struct ArrowSchema** children; - struct ArrowSchema* dictionary; - - // Release callback - void (*release)(struct ArrowSchema*); - // Opaque producer-specific data - void* private_data; + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; }; struct ArrowArray { - // Array data description - int64_t length; - int64_t null_count; - int64_t offset; - int64_t n_buffers; - int64_t n_children; - const void** buffers; - struct ArrowArray** children; - struct ArrowArray* dictionary; - - // Release callback - void (*release)(struct ArrowArray*); - // Opaque producer-specific data - void* private_data; + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; }; #endif // ARROW_C_DATA_INTERFACE @@ -124,39 +122,43 @@ struct ArrowArray { #define ARROW_C_STREAM_INTERFACE struct ArrowArrayStream { - // Callback to get the stream type - // (will be the same for all arrays in the stream). - // - // Return value: 0 if successful, an `errno`-compatible error code otherwise. - // - // If successful, the ArrowSchema must be released independently from the stream. - int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); - - // Callback to get the next array - // (if no error and the array is released, the stream has ended) - // - // Return value: 0 if successful, an `errno`-compatible error code otherwise. - // - // If successful, the ArrowArray must be released independently from the stream. - int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); - - // Callback to get optional detailed error information. - // This must only be called if the last stream operation failed - // with a non-0 return code. - // - // Return value: pointer to a null-terminated character array describing - // the last error, or NULL if no description is available. - // - // The returned pointer is only valid until the next operation on this stream - // (including release). - const char* (*get_last_error)(struct ArrowArrayStream*); - - // Release callback: release the stream's own resources. - // Note that arrays returned by `get_next` must be individually released. - void (*release)(struct ArrowArrayStream*); - - // Opaque producer-specific data - void* private_data; + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code + // otherwise. + // + // If successful, the ArrowSchema must be released independently from the + // stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code + // otherwise. + // + // If successful, the ArrowArray must be released independently from the + // stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this + // stream (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; }; #endif // ARROW_C_STREAM_INTERFACE @@ -169,44 +171,54 @@ struct ArrowArrayStream { #define _NANOARROW_MAKE_NAME(x, y) _NANOARROW_CONCAT(x, y) #define _NANOARROW_RETURN_NOT_OK_IMPL(NAME, EXPR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) return NAME; \ - } while (0) + do { \ + const int NAME = (EXPR); \ + if (NAME) \ + return NAME; \ + } while (0) #define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ - NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) + NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) #define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ - NANOARROW_RETURN_NOT_OK((x_ <= max_) ? NANOARROW_OK : EINVAL) + NANOARROW_RETURN_NOT_OK((x_ <= max_) ? NANOARROW_OK : EINVAL) #if defined(NANOARROW_DEBUG) -#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) { \ - ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d\n* %s:%d", EXPR_STR, \ - NAME, __FILE__, __LINE__); \ - return NAME; \ - } \ - } while (0) +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet( \ + (ERROR_PTR_EXPR), \ + "%s failed with errno %d\n* %s:%d", \ + EXPR_STR, \ + NAME, \ + __FILE__, \ + __LINE__); \ + return NAME; \ + } \ + } while (0) #else -#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) { \ - ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ - return NAME; \ - } \ - } while (0) +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet( \ + (ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ + return NAME; \ + } \ + } while (0) #endif #if defined(NANOARROW_DEBUG) // For checking ArrowErrorSet() calls for valid printf format strings/arguments -// If using mingw's c99-compliant printf, we need a different format-checking attribute +// If using mingw's c99-compliant printf, we need a different format-checking +// attribute #if defined(__USE_MINGW_ANSI_STDIO) && defined(__MINGW_PRINTF_FORMAT) #define NANOARROW_CHECK_PRINTF_ATTRIBUTE \ - __attribute__((format(__MINGW_PRINTF_FORMAT, 2, 3))) + __attribute__((format(__MINGW_PRINTF_FORMAT, 2, 3))) #elif defined(__GNUC__) #define NANOARROW_CHECK_PRINTF_ATTRIBUTE __attribute__((format(printf, 2, 3))) #else @@ -243,24 +255,25 @@ typedef int ArrowErrorCode; /// \brief Flags supported by ArrowSchemaViewInit() /// \ingroup nanoarrow-schema-view -#define NANOARROW_FLAG_ALL_SUPPORTED \ - (ARROW_FLAG_DICTIONARY_ORDERED | ARROW_FLAG_NULLABLE | ARROW_FLAG_MAP_KEYS_SORTED) +#define NANOARROW_FLAG_ALL_SUPPORTED \ + (ARROW_FLAG_DICTIONARY_ORDERED | ARROW_FLAG_NULLABLE | \ + ARROW_FLAG_MAP_KEYS_SORTED) /// \brief Error type containing a UTF-8 encoded message. /// \ingroup nanoarrow-errors struct ArrowError { - /// \brief A character buffer with space for an error message. - char message[1024]; + /// \brief A character buffer with space for an error message. + char message[1024]; }; -/// \brief Ensure an ArrowError is null-terminated by zeroing the first character. -/// \ingroup nanoarrow-errors +/// \brief Ensure an ArrowError is null-terminated by zeroing the first +/// character. \ingroup nanoarrow-errors /// /// If error is NULL, this function does nothing. static inline void ArrowErrorInit(struct ArrowError* error) { - if (error != NULL) { - error->message[0] = '\0'; - } + if (error != NULL) { + error->message[0] = '\0'; + } } /// \brief Get the contents of an error @@ -269,79 +282,93 @@ static inline void ArrowErrorInit(struct ArrowError* error) { /// If error is NULL, returns "", or returns the contents of the error message /// otherwise. static inline const char* ArrowErrorMessage(struct ArrowError* error) { - if (error == NULL) { - return ""; - } else { - return error->message; - } + if (error == NULL) { + return ""; + } else { + return error->message; + } } /// \brief Set the contents of an error from an existing null-terminated string /// \ingroup nanoarrow-errors /// /// If error is NULL, this function does nothing. -static inline void ArrowErrorSetString(struct ArrowError* error, const char* src) { - if (error == NULL) { - return; - } +static inline void ArrowErrorSetString( + struct ArrowError* error, const char* src) { + if (error == NULL) { + return; + } - int64_t src_len = strlen(src); - if (src_len >= ((int64_t)sizeof(error->message))) { - memcpy(error->message, src, sizeof(error->message) - 1); - error->message[sizeof(error->message) - 1] = '\0'; - } else { - memcpy(error->message, src, src_len); - error->message[src_len] = '\0'; - } + int64_t src_len = strlen(src); + if (src_len >= ((int64_t)sizeof(error->message))) { + memcpy(error->message, src, sizeof(error->message) - 1); + error->message[sizeof(error->message) - 1] = '\0'; + } else { + memcpy(error->message, src, src_len); + error->message[src_len] = '\0'; + } } /// \brief Check the result of an expression and return it if not NANOARROW_OK /// \ingroup nanoarrow-errors #define NANOARROW_RETURN_NOT_OK(EXPR) \ - _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) + _NANOARROW_RETURN_NOT_OK_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) /// \brief Check the result of an expression and return it if not NANOARROW_OK, /// adding an auto-generated message to an ArrowError. /// \ingroup nanoarrow-errors /// /// This macro is used to ensure that functions that accept an ArrowError -/// as input always set its message when returning an error code (e.g., when calling -/// a nanoarrow function that does *not* accept ArrowError). +/// as input always set its message when returning an error code (e.g., when +/// calling a nanoarrow function that does *not* accept ArrowError). #define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ - _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ - _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, ERROR_EXPR, #EXPR) + _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), \ + EXPR, \ + ERROR_EXPR, \ + #EXPR) #if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) -#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ - do { \ - fprintf(stderr, "%s failed with code %d\n* %s:%d\n", EXPR_STR, (int)(VALUE), \ - __FILE__, (int)__LINE__); \ - abort(); \ - } while (0) +#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ + do { \ + fprintf( \ + stderr, \ + "%s failed with code %d\n* %s:%d\n", \ + EXPR_STR, \ + (int)(VALUE), \ + __FILE__, \ + (int)__LINE__); \ + abort(); \ + } while (0) #endif #if defined(NANOARROW_DEBUG) #define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ - } while (0) + do { \ + const int NAME = (EXPR); \ + if (NAME) \ + NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ + } while (0) /// \brief Assert that an expression's value is NANOARROW_OK /// \ingroup nanoarrow-errors /// -/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), -/// print a message to stderr and abort. If nanoarrow was built in release mode, -/// this statement has no effect. You can customize fatal error behaviour -/// be defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h -/// This macro is provided as a convenience for users and is not used internally. +/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is +/// true), print a message to stderr and abort. If nanoarrow was built in +/// release mode, this statement has no effect. You can customize fatal error +/// behaviour be defining the NANOARROW_PRINT_AND_DIE macro before including +/// nanoarrow.h This macro is provided as a convenience for users and is not +/// used internally. #define NANOARROW_ASSERT_OK(EXPR) \ - _NANOARROW_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) + _NANOARROW_ASSERT_OK_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) -#define _NANOARROW_DCHECK_IMPL(EXPR, EXPR_STR) \ - do { \ - if (!(EXPR)) NANOARROW_PRINT_AND_DIE(-1, EXPR_STR); \ - } while (0) +#define _NANOARROW_DCHECK_IMPL(EXPR, EXPR_STR) \ + do { \ + if (!(EXPR)) \ + NANOARROW_PRINT_AND_DIE(-1, EXPR_STR); \ + } while (0) #define NANOARROW_DCHECK(EXPR) _NANOARROW_DCHECK_IMPL(EXPR, #EXPR) #else @@ -349,92 +376,97 @@ static inline void ArrowErrorSetString(struct ArrowError* error, const char* src #define NANOARROW_DCHECK(EXPR) #endif -static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { - NANOARROW_DCHECK(src != NULL); - NANOARROW_DCHECK(dst != NULL); +static inline void ArrowSchemaMove( + struct ArrowSchema* src, struct ArrowSchema* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); - memcpy(dst, src, sizeof(struct ArrowSchema)); - src->release = NULL; + memcpy(dst, src, sizeof(struct ArrowSchema)); + src->release = NULL; } static inline void ArrowSchemaRelease(struct ArrowSchema* schema) { - NANOARROW_DCHECK(schema != NULL); - schema->release(schema); - NANOARROW_DCHECK(schema->release == NULL); + NANOARROW_DCHECK(schema != NULL); + schema->release(schema); + NANOARROW_DCHECK(schema->release == NULL); } -static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { - NANOARROW_DCHECK(src != NULL); - NANOARROW_DCHECK(dst != NULL); +static inline void ArrowArrayMove( + struct ArrowArray* src, struct ArrowArray* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); - memcpy(dst, src, sizeof(struct ArrowArray)); - src->release = NULL; + memcpy(dst, src, sizeof(struct ArrowArray)); + src->release = NULL; } static inline void ArrowArrayRelease(struct ArrowArray* array) { - NANOARROW_DCHECK(array != NULL); - array->release(array); - NANOARROW_DCHECK(array->release == NULL); + NANOARROW_DCHECK(array != NULL); + array->release(array); + NANOARROW_DCHECK(array->release == NULL); } -static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, - struct ArrowArrayStream* dst) { - NANOARROW_DCHECK(src != NULL); - NANOARROW_DCHECK(dst != NULL); +static inline void ArrowArrayStreamMove( + struct ArrowArrayStream* src, struct ArrowArrayStream* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); - memcpy(dst, src, sizeof(struct ArrowArrayStream)); - src->release = NULL; + memcpy(dst, src, sizeof(struct ArrowArrayStream)); + src->release = NULL; } static inline const char* ArrowArrayStreamGetLastError( struct ArrowArrayStream* array_stream) { - NANOARROW_DCHECK(array_stream != NULL); + NANOARROW_DCHECK(array_stream != NULL); - const char* value = array_stream->get_last_error(array_stream); - if (value == NULL) { - return ""; - } else { - return value; - } + const char* value = array_stream->get_last_error(array_stream); + if (value == NULL) { + return ""; + } else { + return value; + } } static inline ArrowErrorCode ArrowArrayStreamGetSchema( - struct ArrowArrayStream* array_stream, struct ArrowSchema* out, + struct ArrowArrayStream* array_stream, + struct ArrowSchema* out, struct ArrowError* error) { - NANOARROW_DCHECK(array_stream != NULL); + NANOARROW_DCHECK(array_stream != NULL); - int result = array_stream->get_schema(array_stream, out); - if (result != NANOARROW_OK && error != NULL) { - ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); - } + int result = array_stream->get_schema(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } - return result; + return result; } static inline ArrowErrorCode ArrowArrayStreamGetNext( - struct ArrowArrayStream* array_stream, struct ArrowArray* out, + struct ArrowArrayStream* array_stream, + struct ArrowArray* out, struct ArrowError* error) { - NANOARROW_DCHECK(array_stream != NULL); + NANOARROW_DCHECK(array_stream != NULL); - int result = array_stream->get_next(array_stream, out); - if (result != NANOARROW_OK && error != NULL) { - ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); - } + int result = array_stream->get_next(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } - return result; + return result; } -static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream) { - NANOARROW_DCHECK(array_stream != NULL); - array_stream->release(array_stream); - NANOARROW_DCHECK(array_stream->release == NULL); +static inline void ArrowArrayStreamRelease( + struct ArrowArrayStream* array_stream) { + NANOARROW_DCHECK(array_stream != NULL); + array_stream->release(array_stream); + NANOARROW_DCHECK(array_stream->release == NULL); } static char _ArrowIsLittleEndian(void) { - uint32_t check = 1; - char first_byte; - memcpy(&first_byte, &check, sizeof(char)); - return first_byte; + uint32_t check = 1; + char first_byte; + memcpy(&first_byte, &check, sizeof(char)); + return first_byte; } /// \brief Arrow type enumerator @@ -444,45 +476,45 @@ static char _ArrowIsLittleEndian(void) { /// enumerator; however, the numeric values are specifically not equal /// (i.e., do not rely on numeric comparison). enum ArrowType { - NANOARROW_TYPE_UNINITIALIZED = 0, - NANOARROW_TYPE_NA = 1, - NANOARROW_TYPE_BOOL, - NANOARROW_TYPE_UINT8, - NANOARROW_TYPE_INT8, - NANOARROW_TYPE_UINT16, - NANOARROW_TYPE_INT16, - NANOARROW_TYPE_UINT32, - NANOARROW_TYPE_INT32, - NANOARROW_TYPE_UINT64, - NANOARROW_TYPE_INT64, - NANOARROW_TYPE_HALF_FLOAT, - NANOARROW_TYPE_FLOAT, - NANOARROW_TYPE_DOUBLE, - NANOARROW_TYPE_STRING, - NANOARROW_TYPE_BINARY, - NANOARROW_TYPE_FIXED_SIZE_BINARY, - NANOARROW_TYPE_DATE32, - NANOARROW_TYPE_DATE64, - NANOARROW_TYPE_TIMESTAMP, - NANOARROW_TYPE_TIME32, - NANOARROW_TYPE_TIME64, - NANOARROW_TYPE_INTERVAL_MONTHS, - NANOARROW_TYPE_INTERVAL_DAY_TIME, - NANOARROW_TYPE_DECIMAL128, - NANOARROW_TYPE_DECIMAL256, - NANOARROW_TYPE_LIST, - NANOARROW_TYPE_STRUCT, - NANOARROW_TYPE_SPARSE_UNION, - NANOARROW_TYPE_DENSE_UNION, - NANOARROW_TYPE_DICTIONARY, - NANOARROW_TYPE_MAP, - NANOARROW_TYPE_EXTENSION, - NANOARROW_TYPE_FIXED_SIZE_LIST, - NANOARROW_TYPE_DURATION, - NANOARROW_TYPE_LARGE_STRING, - NANOARROW_TYPE_LARGE_BINARY, - NANOARROW_TYPE_LARGE_LIST, - NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO + NANOARROW_TYPE_UNINITIALIZED = 0, + NANOARROW_TYPE_NA = 1, + NANOARROW_TYPE_BOOL, + NANOARROW_TYPE_UINT8, + NANOARROW_TYPE_INT8, + NANOARROW_TYPE_UINT16, + NANOARROW_TYPE_INT16, + NANOARROW_TYPE_UINT32, + NANOARROW_TYPE_INT32, + NANOARROW_TYPE_UINT64, + NANOARROW_TYPE_INT64, + NANOARROW_TYPE_HALF_FLOAT, + NANOARROW_TYPE_FLOAT, + NANOARROW_TYPE_DOUBLE, + NANOARROW_TYPE_STRING, + NANOARROW_TYPE_BINARY, + NANOARROW_TYPE_FIXED_SIZE_BINARY, + NANOARROW_TYPE_DATE32, + NANOARROW_TYPE_DATE64, + NANOARROW_TYPE_TIMESTAMP, + NANOARROW_TYPE_TIME32, + NANOARROW_TYPE_TIME64, + NANOARROW_TYPE_INTERVAL_MONTHS, + NANOARROW_TYPE_INTERVAL_DAY_TIME, + NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DECIMAL256, + NANOARROW_TYPE_LIST, + NANOARROW_TYPE_STRUCT, + NANOARROW_TYPE_SPARSE_UNION, + NANOARROW_TYPE_DENSE_UNION, + NANOARROW_TYPE_DICTIONARY, + NANOARROW_TYPE_MAP, + NANOARROW_TYPE_EXTENSION, + NANOARROW_TYPE_FIXED_SIZE_LIST, + NANOARROW_TYPE_DURATION, + NANOARROW_TYPE_LARGE_STRING, + NANOARROW_TYPE_LARGE_BINARY, + NANOARROW_TYPE_LARGE_LIST, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO }; /// \brief Get a string value of an enum ArrowType value @@ -492,86 +524,86 @@ enum ArrowType { static inline const char* ArrowTypeString(enum ArrowType type); static inline const char* ArrowTypeString(enum ArrowType type) { - switch (type) { - case NANOARROW_TYPE_NA: - return "na"; - case NANOARROW_TYPE_BOOL: - return "bool"; - case NANOARROW_TYPE_UINT8: - return "uint8"; - case NANOARROW_TYPE_INT8: - return "int8"; - case NANOARROW_TYPE_UINT16: - return "uint16"; - case NANOARROW_TYPE_INT16: - return "int16"; - case NANOARROW_TYPE_UINT32: - return "uint32"; - case NANOARROW_TYPE_INT32: - return "int32"; - case NANOARROW_TYPE_UINT64: - return "uint64"; - case NANOARROW_TYPE_INT64: - return "int64"; - case NANOARROW_TYPE_HALF_FLOAT: - return "half_float"; - case NANOARROW_TYPE_FLOAT: - return "float"; - case NANOARROW_TYPE_DOUBLE: - return "double"; - case NANOARROW_TYPE_STRING: - return "string"; - case NANOARROW_TYPE_BINARY: - return "binary"; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - return "fixed_size_binary"; - case NANOARROW_TYPE_DATE32: - return "date32"; - case NANOARROW_TYPE_DATE64: - return "date64"; - case NANOARROW_TYPE_TIMESTAMP: - return "timestamp"; - case NANOARROW_TYPE_TIME32: - return "time32"; - case NANOARROW_TYPE_TIME64: - return "time64"; - case NANOARROW_TYPE_INTERVAL_MONTHS: - return "interval_months"; - case NANOARROW_TYPE_INTERVAL_DAY_TIME: - return "interval_day_time"; - case NANOARROW_TYPE_DECIMAL128: - return "decimal128"; - case NANOARROW_TYPE_DECIMAL256: - return "decimal256"; - case NANOARROW_TYPE_LIST: - return "list"; - case NANOARROW_TYPE_STRUCT: - return "struct"; - case NANOARROW_TYPE_SPARSE_UNION: - return "sparse_union"; - case NANOARROW_TYPE_DENSE_UNION: - return "dense_union"; - case NANOARROW_TYPE_DICTIONARY: - return "dictionary"; - case NANOARROW_TYPE_MAP: - return "map"; - case NANOARROW_TYPE_EXTENSION: - return "extension"; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - return "fixed_size_list"; - case NANOARROW_TYPE_DURATION: - return "duration"; - case NANOARROW_TYPE_LARGE_STRING: - return "large_string"; - case NANOARROW_TYPE_LARGE_BINARY: - return "large_binary"; - case NANOARROW_TYPE_LARGE_LIST: - return "large_list"; - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - return "interval_month_day_nano"; - default: - return NULL; - } + switch (type) { + case NANOARROW_TYPE_NA: + return "na"; + case NANOARROW_TYPE_BOOL: + return "bool"; + case NANOARROW_TYPE_UINT8: + return "uint8"; + case NANOARROW_TYPE_INT8: + return "int8"; + case NANOARROW_TYPE_UINT16: + return "uint16"; + case NANOARROW_TYPE_INT16: + return "int16"; + case NANOARROW_TYPE_UINT32: + return "uint32"; + case NANOARROW_TYPE_INT32: + return "int32"; + case NANOARROW_TYPE_UINT64: + return "uint64"; + case NANOARROW_TYPE_INT64: + return "int64"; + case NANOARROW_TYPE_HALF_FLOAT: + return "half_float"; + case NANOARROW_TYPE_FLOAT: + return "float"; + case NANOARROW_TYPE_DOUBLE: + return "double"; + case NANOARROW_TYPE_STRING: + return "string"; + case NANOARROW_TYPE_BINARY: + return "binary"; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + return "fixed_size_binary"; + case NANOARROW_TYPE_DATE32: + return "date32"; + case NANOARROW_TYPE_DATE64: + return "date64"; + case NANOARROW_TYPE_TIMESTAMP: + return "timestamp"; + case NANOARROW_TYPE_TIME32: + return "time32"; + case NANOARROW_TYPE_TIME64: + return "time64"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "interval_months"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "interval_day_time"; + case NANOARROW_TYPE_DECIMAL128: + return "decimal128"; + case NANOARROW_TYPE_DECIMAL256: + return "decimal256"; + case NANOARROW_TYPE_LIST: + return "list"; + case NANOARROW_TYPE_STRUCT: + return "struct"; + case NANOARROW_TYPE_SPARSE_UNION: + return "sparse_union"; + case NANOARROW_TYPE_DENSE_UNION: + return "dense_union"; + case NANOARROW_TYPE_DICTIONARY: + return "dictionary"; + case NANOARROW_TYPE_MAP: + return "map"; + case NANOARROW_TYPE_EXTENSION: + return "extension"; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return "fixed_size_list"; + case NANOARROW_TYPE_DURATION: + return "duration"; + case NANOARROW_TYPE_LARGE_STRING: + return "large_string"; + case NANOARROW_TYPE_LARGE_BINARY: + return "large_binary"; + case NANOARROW_TYPE_LARGE_LIST: + return "large_list"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "interval_month_day_nano"; + default: + return NULL; + } } /// \brief Arrow time unit enumerator @@ -580,29 +612,34 @@ static inline const char* ArrowTypeString(enum ArrowType type) { /// These names and values map to the corresponding arrow::TimeUnit::type /// enumerator. enum ArrowTimeUnit { - NANOARROW_TIME_UNIT_SECOND = 0, - NANOARROW_TIME_UNIT_MILLI = 1, - NANOARROW_TIME_UNIT_MICRO = 2, - NANOARROW_TIME_UNIT_NANO = 3 + NANOARROW_TIME_UNIT_SECOND = 0, + NANOARROW_TIME_UNIT_MILLI = 1, + NANOARROW_TIME_UNIT_MICRO = 2, + NANOARROW_TIME_UNIT_NANO = 3 }; /// \brief Validation level enumerator /// \ingroup nanoarrow-array enum ArrowValidationLevel { - /// \brief Do not validate buffer sizes or content. - NANOARROW_VALIDATION_LEVEL_NONE = 0, - - /// \brief Validate buffer sizes that depend on array length but do not validate buffer - /// sizes that depend on buffer data access. - NANOARROW_VALIDATION_LEVEL_MINIMAL = 1, - - /// \brief Validate all buffer sizes, including those that require buffer data access, - /// but do not perform any checks that are O(1) along the length of the buffers. - NANOARROW_VALIDATION_LEVEL_DEFAULT = 2, - - /// \brief Validate all buffer sizes and all buffer content. This is useful in the - /// context of untrusted input or input that may have been corrupted in transit. - NANOARROW_VALIDATION_LEVEL_FULL = 3 + /// \brief Do not validate buffer sizes or content. + NANOARROW_VALIDATION_LEVEL_NONE = 0, + + /// \brief Validate buffer sizes that depend on array length but do not + /// validate buffer + /// sizes that depend on buffer data access. + NANOARROW_VALIDATION_LEVEL_MINIMAL = 1, + + /// \brief Validate all buffer sizes, including those that require buffer + /// data access, + /// but do not perform any checks that are O(1) along the length of the + /// buffers. + NANOARROW_VALIDATION_LEVEL_DEFAULT = 2, + + /// \brief Validate all buffer sizes and all buffer content. This is useful + /// in the + /// context of untrusted input or input that may have been corrupted in + /// transit. + NANOARROW_VALIDATION_LEVEL_FULL = 3 }; /// \brief Get a string value of an enum ArrowTimeUnit value @@ -612,29 +649,29 @@ enum ArrowValidationLevel { static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { - switch (time_unit) { - case NANOARROW_TIME_UNIT_SECOND: - return "s"; - case NANOARROW_TIME_UNIT_MILLI: - return "ms"; - case NANOARROW_TIME_UNIT_MICRO: - return "us"; - case NANOARROW_TIME_UNIT_NANO: - return "ns"; - default: - return NULL; - } -} - -/// \brief Functional types of buffers as described in the Arrow Columnar Specification -/// \ingroup nanoarrow-array-view + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case NANOARROW_TIME_UNIT_MILLI: + return "ms"; + case NANOARROW_TIME_UNIT_MICRO: + return "us"; + case NANOARROW_TIME_UNIT_NANO: + return "ns"; + default: + return NULL; + } +} + +/// \brief Functional types of buffers as described in the Arrow Columnar +/// Specification \ingroup nanoarrow-array-view enum ArrowBufferType { - NANOARROW_BUFFER_TYPE_NONE, - NANOARROW_BUFFER_TYPE_VALIDITY, - NANOARROW_BUFFER_TYPE_TYPE_ID, - NANOARROW_BUFFER_TYPE_UNION_OFFSET, - NANOARROW_BUFFER_TYPE_DATA_OFFSET, - NANOARROW_BUFFER_TYPE_DATA + NANOARROW_BUFFER_TYPE_NONE, + NANOARROW_BUFFER_TYPE_VALIDITY, + NANOARROW_BUFFER_TYPE_TYPE_ID, + NANOARROW_BUFFER_TYPE_UNION_OFFSET, + NANOARROW_BUFFER_TYPE_DATA_OFFSET, + NANOARROW_BUFFER_TYPE_DATA }; /// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout @@ -642,21 +679,22 @@ enum ArrowBufferType { /// /// All currently supported types have 3 buffers or fewer; however, future types /// may involve a variable number of buffers (e.g., string view). These buffers -/// will be represented by separate members of the ArrowArrayView or ArrowLayout. +/// will be represented by separate members of the ArrowArrayView or +/// ArrowLayout. #define NANOARROW_MAX_FIXED_BUFFERS 3 /// \brief An non-owning view of a string /// \ingroup nanoarrow-utils struct ArrowStringView { - /// \brief A pointer to the start of the string - /// - /// If size_bytes is 0, this value may be NULL. - const char* data; - - /// \brief The size of the string in bytes, - /// - /// (Not including the null terminator.) - int64_t size_bytes; + /// \brief A pointer to the start of the string + /// + /// If size_bytes is 0, this value may be NULL. + const char* data; + + /// \brief The size of the string in bytes, + /// + /// (Not including the null terminator.) + int64_t size_bytes; }; /// \brief Return a view of a const C string @@ -664,43 +702,43 @@ struct ArrowStringView { static inline struct ArrowStringView ArrowCharView(const char* value); static inline struct ArrowStringView ArrowCharView(const char* value) { - struct ArrowStringView out; + struct ArrowStringView out; - out.data = value; - if (value) { - out.size_bytes = (int64_t)strlen(value); - } else { - out.size_bytes = 0; - } + out.data = value; + if (value) { + out.size_bytes = (int64_t)strlen(value); + } else { + out.size_bytes = 0; + } - return out; + return out; } union ArrowBufferViewData { - const void* data; - const int8_t* as_int8; - const uint8_t* as_uint8; - const int16_t* as_int16; - const uint16_t* as_uint16; - const int32_t* as_int32; - const uint32_t* as_uint32; - const int64_t* as_int64; - const uint64_t* as_uint64; - const double* as_double; - const float* as_float; - const char* as_char; + const void* data; + const int8_t* as_int8; + const uint8_t* as_uint8; + const int16_t* as_int16; + const uint16_t* as_uint16; + const int32_t* as_int32; + const uint32_t* as_uint32; + const int64_t* as_int64; + const uint64_t* as_uint64; + const double* as_double; + const float* as_float; + const char* as_char; }; /// \brief An non-owning view of a buffer /// \ingroup nanoarrow-utils struct ArrowBufferView { - /// \brief A pointer to the start of the buffer - /// - /// If size_bytes is 0, this value may be NULL. - union ArrowBufferViewData data; + /// \brief A pointer to the start of the buffer + /// + /// If size_bytes is 0, this value may be NULL. + union ArrowBufferViewData data; - /// \brief The size of the buffer in bytes - int64_t size_bytes; + /// \brief The size of the buffer in bytes + int64_t size_bytes; }; /// \brief Array buffer allocation and deallocation @@ -710,43 +748,48 @@ struct ArrowBufferView { /// to customize allocation and deallocation of buffers when constructing /// an ArrowArray. struct ArrowBufferAllocator { - /// \brief Reallocate a buffer or return NULL if it cannot be reallocated - uint8_t* (*reallocate)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, - int64_t old_size, int64_t new_size); - - /// \brief Deallocate a buffer allocated by this allocator - void (*free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); - - /// \brief Opaque data specific to the allocator - void* private_data; + /// \brief Reallocate a buffer or return NULL if it cannot be reallocated + uint8_t* (*reallocate)( + struct ArrowBufferAllocator* allocator, + uint8_t* ptr, + int64_t old_size, + int64_t new_size); + + /// \brief Deallocate a buffer allocated by this allocator + void (*free)( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); + + /// \brief Opaque data specific to the allocator + void* private_data; }; /// \brief An owning mutable view of a buffer /// \ingroup nanoarrow-buffer struct ArrowBuffer { - /// \brief A pointer to the start of the buffer - /// - /// If capacity_bytes is 0, this value may be NULL. - uint8_t* data; + /// \brief A pointer to the start of the buffer + /// + /// If capacity_bytes is 0, this value may be NULL. + uint8_t* data; - /// \brief The size of the buffer in bytes - int64_t size_bytes; + /// \brief The size of the buffer in bytes + int64_t size_bytes; - /// \brief The capacity of the buffer in bytes - int64_t capacity_bytes; + /// \brief The capacity of the buffer in bytes + int64_t capacity_bytes; - /// \brief The allocator that will be used to reallocate and/or free the buffer - struct ArrowBufferAllocator allocator; + /// \brief The allocator that will be used to reallocate and/or free the + /// buffer + struct ArrowBufferAllocator allocator; }; /// \brief An owning mutable view of a bitmap /// \ingroup nanoarrow-bitmap struct ArrowBitmap { - /// \brief An ArrowBuffer to hold the allocated memory - struct ArrowBuffer buffer; + /// \brief An ArrowBuffer to hold the allocated memory + struct ArrowBuffer buffer; - /// \brief The number of bits that have been appended to the bitmap - int64_t size_bits; + /// \brief The number of bits that have been appended to the bitmap + int64_t size_bits; }; /// \brief A description of an arrangement of buffers @@ -756,18 +799,19 @@ struct ArrowBitmap { /// calculate the size of each buffer in an ArrowArray knowing only /// the length and offset of the array. struct ArrowLayout { - /// \brief The function of each buffer - enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS]; + /// \brief The function of each buffer + enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS]; - /// \brief The data type of each buffer - enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS]; + /// \brief The data type of each buffer + enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS]; - /// \brief The size of an element each buffer or 0 if this size is variable or unknown - int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS]; + /// \brief The size of an element each buffer or 0 if this size is variable + /// or unknown + int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS]; - /// \brief The number of elements in the child array per element in this array for a - /// fixed-size list - int64_t child_size_elements; + /// \brief The number of elements in the child array per element in this + /// array for a fixed-size list + int64_t child_size_elements; }; /// \brief A non-owning view of an ArrowArray @@ -780,99 +824,100 @@ struct ArrowLayout { /// ArrowArray that does not exist yet, or use it to validate the buffers /// of a future ArrowArray. struct ArrowArrayView { - /// \brief The underlying ArrowArray or NULL if it has not been set or - /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. - const struct ArrowArray* array; + /// \brief The underlying ArrowArray or NULL if it has not been set or + /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. + const struct ArrowArray* array; - /// \brief The number of elements from the physical start of the buffers. - int64_t offset; + /// \brief The number of elements from the physical start of the buffers. + int64_t offset; - /// \brief The number of elements in this view. - int64_t length; + /// \brief The number of elements in this view. + int64_t length; - /// \brief A cached null count or -1 to indicate that this value is unknown. - int64_t null_count; + /// \brief A cached null count or -1 to indicate that this value is unknown. + int64_t null_count; - /// \brief The type used to store values in this array - /// - /// This type represents only the minimum required information to - /// extract values from the array buffers (e.g., for a Date32 array, - /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded - /// arrays, this will be the index type. - enum ArrowType storage_type; + /// \brief The type used to store values in this array + /// + /// This type represents only the minimum required information to + /// extract values from the array buffers (e.g., for a Date32 array, + /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded + /// arrays, this will be the index type. + enum ArrowType storage_type; - /// \brief The buffer types, strides, and sizes of this Array's buffers - struct ArrowLayout layout; + /// \brief The buffer types, strides, and sizes of this Array's buffers + struct ArrowLayout layout; - /// \brief This Array's buffers as ArrowBufferView objects - struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS]; + /// \brief This Array's buffers as ArrowBufferView objects + struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS]; - /// \brief The number of children of this view - int64_t n_children; + /// \brief The number of children of this view + int64_t n_children; - /// \brief Pointers to views of this array's children - struct ArrowArrayView** children; + /// \brief Pointers to views of this array's children + struct ArrowArrayView** children; - /// \brief Pointer to a view of this array's dictionary - struct ArrowArrayView* dictionary; + /// \brief Pointer to a view of this array's dictionary + struct ArrowArrayView* dictionary; - /// \brief Union type id to child index mapping - /// - /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer - /// such that child_index == union_type_id_map[type_id] and - /// type_id == union_type_id_map[128 + child_index]. This value may be - /// NULL in the case where child_id == type_id. - int8_t* union_type_id_map; + /// \brief Union type id to child index mapping + /// + /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer + /// such that child_index == union_type_id_map[type_id] and + /// type_id == union_type_id_map[128 + child_index]. This value may be + /// NULL in the case where child_id == type_id. + int8_t* union_type_id_map; }; // Used as the private data member for ArrowArrays allocated here and accessed // internally within inline ArrowArray* helpers. struct ArrowArrayPrivateData { - // Holder for the validity buffer (or first buffer for union types, which are - // the only type whose first buffer is not a valdiity buffer) - struct ArrowBitmap bitmap; + // Holder for the validity buffer (or first buffer for union types, which + // are the only type whose first buffer is not a valdiity buffer) + struct ArrowBitmap bitmap; - // Holder for additional buffers as required - struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1]; + // Holder for additional buffers as required + struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1]; - // The array of pointers to buffers. This must be updated after a sequence - // of appends to synchronize its values with the actual buffer addresses - // (which may have ben reallocated uring that time) - const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; + // The array of pointers to buffers. This must be updated after a sequence + // of appends to synchronize its values with the actual buffer addresses + // (which may have ben reallocated uring that time) + const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; - // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown - enum ArrowType storage_type; + // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown + enum ArrowType storage_type; - // The buffer arrangement for the storage type - struct ArrowLayout layout; + // The buffer arrangement for the storage type + struct ArrowLayout layout; - // Flag to indicate if there are non-sequence union type ids. - // In the future this could be replaced with a type id<->child mapping - // to support constructing unions in append mode where type_id != child_index - int8_t union_type_id_is_child_index; + // Flag to indicate if there are non-sequence union type ids. + // In the future this could be replaced with a type id<->child mapping + // to support constructing unions in append mode where type_id != + // child_index + int8_t union_type_id_is_child_index; }; /// \brief A representation of an interval. /// \ingroup nanoarrow-utils struct ArrowInterval { - /// \brief The type of interval being used - enum ArrowType type; - /// \brief The number of months represented by the interval - int32_t months; - /// \brief The number of days represented by the interval - int32_t days; - /// \brief The number of ms represented by the interval - int32_t ms; - /// \brief The number of ns represented by the interval - int64_t ns; + /// \brief The type of interval being used + enum ArrowType type; + /// \brief The number of months represented by the interval + int32_t months; + /// \brief The number of days represented by the interval + int32_t days; + /// \brief The number of ms represented by the interval + int32_t ms; + /// \brief The number of ns represented by the interval + int64_t ns; }; /// \brief Zero initialize an Interval with a given unit /// \ingroup nanoarrow-utils -static inline void ArrowIntervalInit(struct ArrowInterval* interval, - enum ArrowType type) { - memset(interval, 0, sizeof(struct ArrowInterval)); - interval->type = type; +static inline void ArrowIntervalInit( + struct ArrowInterval* interval, enum ArrowType type) { + memset(interval, 0, sizeof(struct ArrowInterval)); + interval->type = type; } /// \brief A representation of a fixed-precision decimal number @@ -882,41 +927,47 @@ static inline void ArrowIntervalInit(struct ArrowInterval* interval, /// values set using ArrowDecimalSetInt(), ArrowDecimalSetBytes128(), /// or ArrowDecimalSetBytes256(). struct ArrowDecimal { - /// \brief An array of 64-bit integers of n_words length defined in native-endian order - uint64_t words[4]; + /// \brief An array of 64-bit integers of n_words length defined in + /// native-endian order + uint64_t words[4]; - /// \brief The number of significant digits this decimal number can represent - int32_t precision; + /// \brief The number of significant digits this decimal number can + /// represent + int32_t precision; - /// \brief The number of digits after the decimal point. This can be negative. - int32_t scale; + /// \brief The number of digits after the decimal point. This can be + /// negative. + int32_t scale; - /// \brief The number of words in the words array - int n_words; + /// \brief The number of words in the words array + int n_words; - /// \brief Cached value used by the implementation - int high_word_index; + /// \brief Cached value used by the implementation + int high_word_index; - /// \brief Cached value used by the implementation - int low_word_index; + /// \brief Cached value used by the implementation + int low_word_index; }; /// \brief Initialize a decimal with a given set of type parameters /// \ingroup nanoarrow-utils -static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwidth, - int32_t precision, int32_t scale) { - memset(decimal->words, 0, sizeof(decimal->words)); - decimal->precision = precision; - decimal->scale = scale; - decimal->n_words = bitwidth / 8 / sizeof(uint64_t); - - if (_ArrowIsLittleEndian()) { - decimal->low_word_index = 0; - decimal->high_word_index = decimal->n_words - 1; - } else { - decimal->low_word_index = decimal->n_words - 1; - decimal->high_word_index = 0; - } +static inline void ArrowDecimalInit( + struct ArrowDecimal* decimal, + int32_t bitwidth, + int32_t precision, + int32_t scale) { + memset(decimal->words, 0, sizeof(decimal->words)); + decimal->precision = precision; + decimal->scale = scale; + decimal->n_words = bitwidth / 8 / sizeof(uint64_t); + + if (_ArrowIsLittleEndian()) { + decimal->low_word_index = 0; + decimal->high_word_index = decimal->n_words - 1; + } else { + decimal->low_word_index = decimal->n_words - 1; + decimal->high_word_index = 0; + } } /// \brief Get a signed integer value of a sufficiently small ArrowDecimal @@ -924,62 +975,64 @@ static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwid /// This does not check if the decimal's precision sufficiently small to fit /// within the signed 64-bit integer range (A precision less than or equal /// to 18 is sufficiently small). -static inline int64_t ArrowDecimalGetIntUnsafe(const struct ArrowDecimal* decimal) { - return (int64_t)decimal->words[decimal->low_word_index]; +static inline int64_t ArrowDecimalGetIntUnsafe( + const struct ArrowDecimal* decimal) { + return (int64_t)decimal->words[decimal->low_word_index]; } /// \brief Copy the bytes of this decimal into a sufficiently large buffer /// \ingroup nanoarrow-utils -static inline void ArrowDecimalGetBytes(const struct ArrowDecimal* decimal, - uint8_t* out) { - memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); +static inline void ArrowDecimalGetBytes( + const struct ArrowDecimal* decimal, uint8_t* out) { + memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); } /// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise /// \ingroup nanoarrow-utils static inline int64_t ArrowDecimalSign(const struct ArrowDecimal* decimal) { - return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); + return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); } /// \brief Sets the integer value of this decimal /// \ingroup nanoarrow-utils -static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t value) { - if (value < 0) { - memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); - } else { - memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); - } +static inline void ArrowDecimalSetInt( + struct ArrowDecimal* decimal, int64_t value) { + if (value < 0) { + memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); + } else { + memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); + } - decimal->words[decimal->low_word_index] = value; + decimal->words[decimal->low_word_index] = value; } /// \brief Negate the value of this decimal in place /// \ingroup nanoarrow-utils static inline void ArrowDecimalNegate(struct ArrowDecimal* decimal) { - uint64_t carry = 1; - - if (decimal->low_word_index == 0) { - for (int i = 0; i < decimal->n_words; i++) { - uint64_t elem = decimal->words[i]; - elem = ~elem + carry; - carry &= (elem == 0); - decimal->words[i] = elem; - } - } else { - for (int i = decimal->low_word_index; i >= 0; i--) { - uint64_t elem = decimal->words[i]; - elem = ~elem + carry; - carry &= (elem == 0); - decimal->words[i] = elem; + uint64_t carry = 1; + + if (decimal->low_word_index == 0) { + for (int i = 0; i < decimal->n_words; i++) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } + } else { + for (int i = decimal->low_word_index; i >= 0; i--) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } } - } } /// \brief Copy bytes from a buffer into this decimal /// \ingroup nanoarrow-utils -static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, - const uint8_t* value) { - memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); +static inline void ArrowDecimalSetBytes( + struct ArrowDecimal* decimal, const uint8_t* value) { + memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); } #ifdef __cplusplus @@ -1011,11 +1064,9 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, #include #include - - -// If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will set this -// define in nanoarrow_config.h. If not, you can optionally #define NANOARROW_NAMESPACE -// MyNamespace here. +// If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will +// set this define in nanoarrow_config.h. If not, you can optionally #define +// NANOARROW_NAMESPACE MyNamespace here. // This section remaps the non-prefixed symbols to the prefixed symbols so that // code written against this build can be used independent of the value of @@ -1024,104 +1075,118 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, #define NANOARROW_CAT(A, B) A##B #define NANOARROW_SYMBOL(A, B) NANOARROW_CAT(A, B) -#define ArrowNanoarrowVersion NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) +#define ArrowNanoarrowVersion \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) #define ArrowNanoarrowVersionInt \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) #define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc) #define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc) #define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree) #define ArrowBufferAllocatorDefault \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault) #define ArrowBufferDeallocator \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) #define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet) #define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit) -#define ArrowDecimalSetDigits NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalSetDigits) +#define ArrowDecimalSetDigits \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalSetDigits) #define ArrowDecimalAppendDigitsToBuffer \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalAppendDigitsToBuffer) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalAppendDigitsToBuffer) #define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit) #define ArrowSchemaInitFromType \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) -#define ArrowSchemaSetType NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) +#define ArrowSchemaSetType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType) #define ArrowSchemaSetTypeStruct \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct) #define ArrowSchemaSetTypeFixedSize \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) #define ArrowSchemaSetTypeDecimal \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) #define ArrowSchemaSetTypeDateTime \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) #define ArrowSchemaSetTypeUnion \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion) -#define ArrowSchemaDeepCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy) -#define ArrowSchemaSetFormat NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat) -#define ArrowSchemaSetName NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion) +#define ArrowSchemaDeepCopy \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy) +#define ArrowSchemaSetFormat \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat) +#define ArrowSchemaSetName \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName) #define ArrowSchemaSetMetadata \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata) #define ArrowSchemaAllocateChildren \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren) #define ArrowSchemaAllocateDictionary \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary) #define ArrowMetadataReaderInit \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit) #define ArrowMetadataReaderRead \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead) -#define ArrowMetadataSizeOf NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf) -#define ArrowMetadataHasKey NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey) -#define ArrowMetadataGetValue NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead) +#define ArrowMetadataSizeOf \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf) +#define ArrowMetadataHasKey \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey) +#define ArrowMetadataGetValue \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue) #define ArrowMetadataBuilderInit \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit) #define ArrowMetadataBuilderAppend \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend) #define ArrowMetadataBuilderSet \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet) #define ArrowMetadataBuilderRemove \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove) -#define ArrowSchemaViewInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit) -#define ArrowSchemaToString NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove) +#define ArrowSchemaViewInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit) +#define ArrowSchemaToString \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString) #define ArrowArrayInitFromType \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) #define ArrowArrayInitFromSchema \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) #define ArrowArrayInitFromArrayView \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) #define ArrowArrayInitFromArrayView \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) #define ArrowArrayAllocateChildren \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) #define ArrowArrayAllocateDictionary \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary) #define ArrowArraySetValidityBitmap \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap) -#define ArrowArraySetBuffer NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer) -#define ArrowArrayReserve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap) +#define ArrowArraySetBuffer \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer) +#define ArrowArrayReserve \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve) #define ArrowArrayFinishBuilding \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) #define ArrowArrayFinishBuildingDefault \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) #define ArrowArrayViewInitFromType \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) #define ArrowArrayViewInitFromSchema \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) #define ArrowArrayViewAllocateChildren \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) #define ArrowArrayViewAllocateDictionary \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary) #define ArrowArrayViewSetLength \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength) #define ArrowArrayViewSetArray \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) #define ArrowArrayViewSetArrayMinimal \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) #define ArrowArrayViewValidate \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) -#define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) +#define ArrowArrayViewReset \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) #define ArrowBasicArrayStreamInit \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) #define ArrowBasicArrayStreamSetArray \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray) #define ArrowBasicArrayStreamValidate \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate) + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate) #endif @@ -1169,31 +1234,33 @@ struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void); /// avoid copying an existing buffer that was not allocated using the /// infrastructure provided here (e.g., by an R or Python object). struct ArrowBufferAllocator ArrowBufferDeallocator( - void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, - int64_t size), + void (*custom_free)( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size), void* private_data); /// @} -/// \brief Move the contents of an src ArrowSchema into dst and set src->release to NULL -/// \ingroup nanoarrow-arrow-cdata -static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst); +/// \brief Move the contents of an src ArrowSchema into dst and set src->release +/// to NULL \ingroup nanoarrow-arrow-cdata +static inline void ArrowSchemaMove( + struct ArrowSchema* src, struct ArrowSchema* dst); /// \brief Call the release callback of an ArrowSchema /// \ingroup nanoarrow-arrow-cdata static inline void ArrowSchemaRelease(struct ArrowSchema* schema); -/// \brief Move the contents of an src ArrowArray into dst and set src->release to NULL -/// \ingroup nanoarrow-arrow-cdata -static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst); +/// \brief Move the contents of an src ArrowArray into dst and set src->release +/// to NULL \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayMove( + struct ArrowArray* src, struct ArrowArray* dst); /// \brief Call the release callback of an ArrowArray static inline void ArrowArrayRelease(struct ArrowArray* array); -/// \brief Move the contents of an src ArrowArrayStream into dst and set src->release to -/// NULL \ingroup nanoarrow-arrow-cdata -static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, - struct ArrowArrayStream* dst); +/// \brief Move the contents of an src ArrowArrayStream into dst and set +/// src->release to NULL \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayStreamMove( + struct ArrowArrayStream* src, struct ArrowArrayStream* dst); /// \brief Call the get_schema callback of an ArrowArrayStream /// \ingroup nanoarrow-arrow-cdata @@ -1203,7 +1270,8 @@ static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, /// makes it significantly less verbose to iterate over array streams /// using NANOARROW_RETURN_NOT_OK()-style error handling. static inline ArrowErrorCode ArrowArrayStreamGetSchema( - struct ArrowArrayStream* array_stream, struct ArrowSchema* out, + struct ArrowArrayStream* array_stream, + struct ArrowSchema* out, struct ArrowError* error); /// \brief Call the get_schema callback of an ArrowArrayStream @@ -1214,20 +1282,22 @@ static inline ArrowErrorCode ArrowArrayStreamGetSchema( /// makes it significantly less verbose to iterate over array streams /// using NANOARROW_RETURN_NOT_OK()-style error handling. static inline ArrowErrorCode ArrowArrayStreamGetNext( - struct ArrowArrayStream* array_stream, struct ArrowArray* out, + struct ArrowArrayStream* array_stream, + struct ArrowArray* out, struct ArrowError* error); /// \brief Call the get_next callback of an ArrowArrayStream /// \ingroup nanoarrow-arrow-cdata /// /// Unlike the get_next callback, this function never returns NULL (i.e., its -/// result is safe to use in printf-style error formatters). Null values from the -/// original callback are reported as "". +/// result is safe to use in printf-style error formatters). Null values from +/// the original callback are reported as "". static inline const char* ArrowArrayStreamGetLastError( struct ArrowArrayStream* array_stream); /// \brief Call the release callback of an ArrowArrayStream -static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream); +static inline void ArrowArrayStreamRelease( + struct ArrowArrayStream* array_stream); /// \defgroup nanoarrow-errors Error handling /// @@ -1235,24 +1305,24 @@ static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream /// need to communicate more verbose error information accept a pointer /// to an ArrowError. This can be stack or statically allocated. The /// content of the message is undefined unless an error code has been -/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, the -/// ArrowError pointed to by the argument will be propagated with a +/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, +/// the ArrowError pointed to by the argument will be propagated with a /// null-terminated error message. It is safe to pass a NULL ArrowError anywhere /// in the nanoarrow API. /// /// Except where documented, it is generally not safe to continue after a -/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK and -/// NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ clients can use -/// the helpers provided in the nanoarrow.hpp header to facilitate using C++ idioms -/// for memory management and error propgagtion. +/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK +/// and NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ +/// clients can use the helpers provided in the nanoarrow.hpp header to +/// facilitate using C++ idioms for memory management and error propgagtion. /// /// @{ /// \brief Set the contents of an error using printf syntax. /// /// If error is NULL, this function does nothing and returns NANOARROW_OK. -NANOARROW_CHECK_PRINTF_ATTRIBUTE int ArrowErrorSet(struct ArrowError* error, - const char* fmt, ...); +NANOARROW_CHECK_PRINTF_ATTRIBUTE int ArrowErrorSet( + struct ArrowError* error, const char* fmt, ...); /// @} @@ -1273,12 +1343,12 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type); static inline struct ArrowStringView ArrowCharView(const char* value); /// \brief Sets the integer value of an ArrowDecimal from a string -ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, - struct ArrowStringView value); +ArrowErrorCode ArrowDecimalSetDigits( + struct ArrowDecimal* decimal, struct ArrowStringView value); /// \brief Get the integer value of an ArrowDecimal as string -ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, - struct ArrowBuffer* buffer); +ArrowErrorCode ArrowDecimalAppendDigitsToBuffer( + const struct ArrowDecimal* decimal, struct ArrowBuffer* buffer); /// @} @@ -1299,9 +1369,10 @@ void ArrowSchemaInit(struct ArrowSchema* schema); /// /// A convenience constructor for that calls ArrowSchemaInit() and /// ArrowSchemaSetType() for the common case of constructing an -/// unparameterized type. The caller is responsible for calling the schema->release -/// callback if NANOARROW_OK is returned. -ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type); +/// unparameterized type. The caller is responsible for calling the +/// schema->release callback if NANOARROW_OK is returned. +ArrowErrorCode ArrowSchemaInitFromType( + struct ArrowSchema* schema, enum ArrowType type); /// \brief Get a human-readable summary of a Schema /// @@ -1309,8 +1380,8 @@ ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowTyp /// and returns the number of characters required for the output if /// n were sufficiently large. If recursive is non-zero, the result will /// also include children. -int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, - char recursive); +int64_t ArrowSchemaToString( + const struct ArrowSchema* schema, char* out, int64_t n, char recursive); /// \brief Set the format field of a schema from an ArrowType /// @@ -1318,16 +1389,19 @@ int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t /// NANOARROW_TYPE_LIST, NANOARROW_TYPE_LARGE_LIST, and /// NANOARROW_TYPE_MAP, the appropriate number of children are /// allocated, initialized, and named; however, the caller must -/// ArrowSchemaSetType() on the preinitialized children. Schema must have been initialized -/// using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type); +/// ArrowSchemaSetType() on the preinitialized children. Schema must have been +/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetType( + struct ArrowSchema* schema, enum ArrowType type); /// \brief Set the format field and initialize children of a struct schema /// -/// The specified number of children are initialized; however, the caller is responsible -/// for calling ArrowSchemaSetType() and ArrowSchemaSetName() on each child. -/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children); +/// The specified number of children are initialized; however, the caller is +/// responsible for calling ArrowSchemaSetType() and ArrowSchemaSetName() on +/// each child. Schema must have been initialized using ArrowSchemaInit() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeStruct( + struct ArrowSchema* schema, int64_t n_children); /// \brief Set the format field of a fixed-size schema /// @@ -1335,50 +1409,55 @@ ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_ch /// NANOARROW_TYPE_FIXED_SIZE_BINARY or NANOARROW_TYPE_FIXED_SIZE_LIST. /// For NANOARROW_TYPE_FIXED_SIZE_LIST, the appropriate number of children are /// allocated, initialized, and named; however, the caller must -/// ArrowSchemaSetType() the first child. Schema must have been initialized using -/// ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, - enum ArrowType type, int32_t fixed_size); +/// ArrowSchemaSetType() the first child. Schema must have been initialized +/// using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeFixedSize( + struct ArrowSchema* schema, enum ArrowType type, int32_t fixed_size); /// \brief Set the format field of a decimal schema /// /// Returns EINVAL for scale <= 0 or for type that is not -/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have been -/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, - int32_t decimal_precision, - int32_t decimal_scale); +/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have +/// been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDecimal( + struct ArrowSchema* schema, + enum ArrowType type, + int32_t decimal_precision, + int32_t decimal_scale); /// \brief Set the format field of a time, timestamp, or duration schema /// /// Returns EINVAL for type that is not /// NANOARROW_TYPE_TIME32, NANOARROW_TYPE_TIME64, /// NANOARROW_TYPE_TIMESTAMP, or NANOARROW_TYPE_DURATION. The -/// timezone parameter must be NULL for a non-timestamp type. Schema must have been -/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, - enum ArrowTimeUnit time_unit, - const char* timezone); +/// timezone parameter must be NULL for a non-timestamp type. Schema must have +/// been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDateTime( + struct ArrowSchema* schema, + enum ArrowType type, + enum ArrowTimeUnit time_unit, + const char* timezone); /// \brief Seet the format field of a union schema /// /// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION /// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are /// allocated, and initialized. -ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, - int64_t n_children); +ArrowErrorCode ArrowSchemaSetTypeUnion( + struct ArrowSchema* schema, enum ArrowType type, int64_t n_children); /// \brief Make a (recursive) copy of a schema /// /// Allocates and copies fields of schema into schema_out. -ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, - struct ArrowSchema* schema_out); +ArrowErrorCode ArrowSchemaDeepCopy( + const struct ArrowSchema* schema, struct ArrowSchema* schema_out); /// \brief Copy format into schema->format /// /// schema must have been allocated using ArrowSchemaInitFromType() or /// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format); +ArrowErrorCode ArrowSchemaSetFormat( + struct ArrowSchema* schema, const char* format); /// \brief Copy name into schema->name /// @@ -1390,15 +1469,16 @@ ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name); /// /// schema must have been allocated using ArrowSchemaInitFromType() or /// ArrowSchemaDeepCopy. -ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata); +ArrowErrorCode ArrowSchemaSetMetadata( + struct ArrowSchema* schema, const char* metadata); /// \brief Allocate the schema->children array /// /// Includes the memory for each child struct ArrowSchema. /// schema must have been allocated using ArrowSchemaInitFromType() or /// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, - int64_t n_children); +ArrowErrorCode ArrowSchemaAllocateChildren( + struct ArrowSchema* schema, int64_t n_children); /// \brief Allocate the schema->dictionary member /// @@ -1417,24 +1497,25 @@ ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema); /// The ArrowMetadataReader does not own any data and is only valid /// for the lifetime of the underlying metadata pointer. struct ArrowMetadataReader { - /// \brief A metadata string from a schema->metadata field. - const char* metadata; + /// \brief A metadata string from a schema->metadata field. + const char* metadata; - /// \brief The current offset into the metadata string - int64_t offset; + /// \brief The current offset into the metadata string + int64_t offset; - /// \brief The number of remaining keys - int32_t remaining_keys; + /// \brief The number of remaining keys + int32_t remaining_keys; }; /// \brief Initialize an ArrowMetadataReader -ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, - const char* metadata); +ArrowErrorCode ArrowMetadataReaderInit( + struct ArrowMetadataReader* reader, const char* metadata); /// \brief Read the next key/value pair from an ArrowMetadataReader -ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, - struct ArrowStringView* key_out, - struct ArrowStringView* value_out); +ArrowErrorCode ArrowMetadataReaderRead( + struct ArrowMetadataReader* reader, + struct ArrowStringView* key_out, + struct ArrowStringView* value_out); /// \brief The number of bytes in in a key/value metadata string int64_t ArrowMetadataSizeOf(const char* metadata); @@ -1445,32 +1526,37 @@ char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key); /// \brief Extract a value from schema metadata /// /// If key does not exist in metadata, value_out is unmodified -ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, - struct ArrowStringView* value_out); +ArrowErrorCode ArrowMetadataGetValue( + const char* metadata, + struct ArrowStringView key, + struct ArrowStringView* value_out); /// \brief Initialize a builder for schema metadata from key/value pairs /// /// metadata can be an existing metadata string or NULL to initialize /// an empty metadata string. -ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata); +ArrowErrorCode ArrowMetadataBuilderInit( + struct ArrowBuffer* buffer, const char* metadata); /// \brief Append a key/value pair to a buffer containing serialized metadata -ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, - struct ArrowStringView key, - struct ArrowStringView value); +ArrowErrorCode ArrowMetadataBuilderAppend( + struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); /// \brief Set a key/value pair to a buffer containing serialized metadata /// /// Ensures that the only entry for key in the metadata is set to value. /// This function maintains the existing position of (the first instance of) /// key if present in the data. -ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, - struct ArrowStringView key, - struct ArrowStringView value); +ArrowErrorCode ArrowMetadataBuilderSet( + struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); /// \brief Remove a key from a buffer containing serialized metadata -ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, - struct ArrowStringView key); +ArrowErrorCode ArrowMetadataBuilderRemove( + struct ArrowBuffer* buffer, struct ArrowStringView key); /// @} @@ -1485,92 +1571,93 @@ ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, /// encouraged to use the provided getters to ensure forward /// compatibility. struct ArrowSchemaView { - /// \brief A pointer to the schema represented by this view - const struct ArrowSchema* schema; - - /// \brief The data type represented by the schema - /// - /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a - /// non-null dictionary member; datetime types are valid values. - /// This value will never be NANOARROW_TYPE_EXTENSION (see - /// extension_name and/or extension_metadata to check for - /// an extension type). - enum ArrowType type; - - /// \brief The storage data type represented by the schema - /// - /// This value will never be NANOARROW_TYPE_DICTIONARY, NANOARROW_TYPE_EXTENSION - /// or any datetime type. This value represents only the type required to - /// interpret the buffers in the array. - enum ArrowType storage_type; - - /// \brief The storage layout represented by the schema - struct ArrowLayout layout; - - /// \brief The extension type name if it exists - /// - /// If the ARROW:extension:name key is present in schema.metadata, - /// extension_name.data will be non-NULL. - struct ArrowStringView extension_name; - - /// \brief The extension type metadata if it exists - /// - /// If the ARROW:extension:metadata key is present in schema.metadata, - /// extension_metadata.data will be non-NULL. - struct ArrowStringView extension_metadata; - - /// \brief Format fixed size parameter - /// - /// This value is set when parsing a fixed-size binary or fixed-size - /// list schema; this value is undefined for other types. For a - /// fixed-size binary schema this value is in bytes; for a fixed-size - /// list schema this value refers to the number of child elements for - /// each element of the parent. - int32_t fixed_size; - - /// \brief Decimal bitwidth - /// - /// This value is set when parsing a decimal type schema; - /// this value is undefined for other types. - int32_t decimal_bitwidth; - - /// \brief Decimal precision - /// - /// This value is set when parsing a decimal type schema; - /// this value is undefined for other types. - int32_t decimal_precision; - - /// \brief Decimal scale - /// - /// This value is set when parsing a decimal type schema; - /// this value is undefined for other types. - int32_t decimal_scale; - - /// \brief Format time unit parameter - /// - /// This value is set when parsing a date/time type. The value is - /// undefined for other types. - enum ArrowTimeUnit time_unit; - - /// \brief Format timezone parameter - /// - /// This value is set when parsing a timestamp type and represents - /// the timezone format parameter. This value points to - /// data within the schema and is undefined for other types. - const char* timezone; - - /// \brief Union type ids parameter - /// - /// This value is set when parsing a union type and represents - /// type ids parameter. This value points to - /// data within the schema and is undefined for other types. - const char* union_type_ids; + /// \brief A pointer to the schema represented by this view + const struct ArrowSchema* schema; + + /// \brief The data type represented by the schema + /// + /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a + /// non-null dictionary member; datetime types are valid values. + /// This value will never be NANOARROW_TYPE_EXTENSION (see + /// extension_name and/or extension_metadata to check for + /// an extension type). + enum ArrowType type; + + /// \brief The storage data type represented by the schema + /// + /// This value will never be NANOARROW_TYPE_DICTIONARY, + /// NANOARROW_TYPE_EXTENSION or any datetime type. This value represents + /// only the type required to interpret the buffers in the array. + enum ArrowType storage_type; + + /// \brief The storage layout represented by the schema + struct ArrowLayout layout; + + /// \brief The extension type name if it exists + /// + /// If the ARROW:extension:name key is present in schema.metadata, + /// extension_name.data will be non-NULL. + struct ArrowStringView extension_name; + + /// \brief The extension type metadata if it exists + /// + /// If the ARROW:extension:metadata key is present in schema.metadata, + /// extension_metadata.data will be non-NULL. + struct ArrowStringView extension_metadata; + + /// \brief Format fixed size parameter + /// + /// This value is set when parsing a fixed-size binary or fixed-size + /// list schema; this value is undefined for other types. For a + /// fixed-size binary schema this value is in bytes; for a fixed-size + /// list schema this value refers to the number of child elements for + /// each element of the parent. + int32_t fixed_size; + + /// \brief Decimal bitwidth + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_bitwidth; + + /// \brief Decimal precision + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_precision; + + /// \brief Decimal scale + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_scale; + + /// \brief Format time unit parameter + /// + /// This value is set when parsing a date/time type. The value is + /// undefined for other types. + enum ArrowTimeUnit time_unit; + + /// \brief Format timezone parameter + /// + /// This value is set when parsing a timestamp type and represents + /// the timezone format parameter. This value points to + /// data within the schema and is undefined for other types. + const char* timezone; + + /// \brief Union type ids parameter + /// + /// This value is set when parsing a union type and represents + /// type ids parameter. This value points to + /// data within the schema and is undefined for other types. + const char* union_type_ids; }; /// \brief Initialize an ArrowSchemaView -ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, - const struct ArrowSchema* schema, - struct ArrowError* error); +ArrowErrorCode ArrowSchemaViewInit( + struct ArrowSchemaView* schema_view, + const struct ArrowSchema* schema, + struct ArrowError* error); /// @} @@ -1601,7 +1688,8 @@ static inline void ArrowBufferReset(struct ArrowBuffer* buffer); /// /// Transfers the buffer data and lifecycle management to another /// address and resets buffer. -static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst); +static inline void ArrowBufferMove( + struct ArrowBuffer* src, struct ArrowBuffer* dst); /// \brief Grow or shrink a buffer to a given capacity /// @@ -1609,85 +1697,84 @@ static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* /// if shrink_to_fit is non-zero. Calling ArrowBufferResize() does not /// adjust the buffer's size member except to ensure that the invariant /// capacity >= size remains true. -static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, - int64_t new_capacity_bytes, - char shrink_to_fit); +static inline ArrowErrorCode ArrowBufferResize( + struct ArrowBuffer* buffer, int64_t new_capacity_bytes, char shrink_to_fit); /// \brief Ensure a buffer has at least a given additional capacity /// /// Ensures that the buffer has space to append at least /// additional_size_bytes, overallocating when required. -static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, - int64_t additional_size_bytes); +static inline ArrowErrorCode ArrowBufferReserve( + struct ArrowBuffer* buffer, int64_t additional_size_bytes); /// \brief Write data to buffer and increment the buffer size /// /// This function does not check that buffer has the required capacity -static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, - int64_t size_bytes); +static inline void ArrowBufferAppendUnsafe( + struct ArrowBuffer* buffer, const void* data, int64_t size_bytes); /// \brief Write data to buffer and increment the buffer size /// /// This function writes and ensures that the buffer has the required capacity, /// possibly by reallocating the buffer. Like ArrowBufferReserve, this will /// overallocate when reallocation is required. -static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, - const void* data, int64_t size_bytes); +static inline ArrowErrorCode ArrowBufferAppend( + struct ArrowBuffer* buffer, const void* data, int64_t size_bytes); /// \brief Write fill to buffer and increment the buffer size /// /// This function writes the specified number of fill bytes and /// ensures that the buffer has the required capacity, -static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, - uint8_t value, int64_t size_bytes); +static inline ArrowErrorCode ArrowBufferAppendFill( + struct ArrowBuffer* buffer, uint8_t value, int64_t size_bytes); /// \brief Write an 8-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, - int8_t value); +static inline ArrowErrorCode ArrowBufferAppendInt8( + struct ArrowBuffer* buffer, int8_t value); /// \brief Write an unsigned 8-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, - uint8_t value); +static inline ArrowErrorCode ArrowBufferAppendUInt8( + struct ArrowBuffer* buffer, uint8_t value); /// \brief Write a 16-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, - int16_t value); +static inline ArrowErrorCode ArrowBufferAppendInt16( + struct ArrowBuffer* buffer, int16_t value); /// \brief Write an unsigned 16-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, - uint16_t value); +static inline ArrowErrorCode ArrowBufferAppendUInt16( + struct ArrowBuffer* buffer, uint16_t value); /// \brief Write a 32-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, - int32_t value); +static inline ArrowErrorCode ArrowBufferAppendInt32( + struct ArrowBuffer* buffer, int32_t value); /// \brief Write an unsigned 32-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, - uint32_t value); +static inline ArrowErrorCode ArrowBufferAppendUInt32( + struct ArrowBuffer* buffer, uint32_t value); /// \brief Write a 64-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, - int64_t value); +static inline ArrowErrorCode ArrowBufferAppendInt64( + struct ArrowBuffer* buffer, int64_t value); /// \brief Write an unsigned 64-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, - uint64_t value); +static inline ArrowErrorCode ArrowBufferAppendUInt64( + struct ArrowBuffer* buffer, uint64_t value); /// \brief Write a double to a buffer -static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, - double value); +static inline ArrowErrorCode ArrowBufferAppendDouble( + struct ArrowBuffer* buffer, double value); /// \brief Write a float to a buffer -static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, - float value); +static inline ArrowErrorCode ArrowBufferAppendFloat( + struct ArrowBuffer* buffer, float value); /// \brief Write an ArrowStringView to a buffer -static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, - struct ArrowStringView value); +static inline ArrowErrorCode ArrowBufferAppendStringView( + struct ArrowBuffer* buffer, struct ArrowStringView value); /// \brief Write an ArrowBufferView to a buffer -static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, - struct ArrowBufferView value); +static inline ArrowErrorCode ArrowBufferAppendBufferView( + struct ArrowBuffer* buffer, struct ArrowBufferView value); /// @} @@ -1708,19 +1795,20 @@ static inline void ArrowBitClear(uint8_t* bits, int64_t i); static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t value); /// \brief Set a boolean value to a range in a bitmap -static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, - uint8_t bits_are_set); +static inline void ArrowBitsSetTo( + uint8_t* bits, int64_t start_offset, int64_t length, uint8_t bits_are_set); /// \brief Count true values in a bitmap -static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); +static inline int64_t ArrowBitCountSet( + const uint8_t* bits, int64_t i_from, int64_t i_to); /// \brief Extract int8 boolean values from a range in a bitmap -static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, - int64_t length, int8_t* out); +static inline void ArrowBitsUnpackInt8( + const uint8_t* bits, int64_t start_offset, int64_t length, int8_t* out); /// \brief Extract int32 boolean values from a range in a bitmap -static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, - int64_t length, int32_t* out); +static inline void ArrowBitsUnpackInt32( + const uint8_t* bits, int64_t start_offset, int64_t length, int32_t* out); /// \brief Initialize an ArrowBitmap /// @@ -1731,14 +1819,15 @@ static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap); /// /// Transfers the underlying buffer data and lifecycle management to another /// address and resets the bitmap. -static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst); +static inline void ArrowBitmapMove( + struct ArrowBitmap* src, struct ArrowBitmap* dst); /// \brief Ensure a bitmap builder has at least a given additional capacity /// /// Ensures that the buffer has space to append at least /// additional_size_bits, overallocating when required. -static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, - int64_t additional_size_bits); +static inline ArrowErrorCode ArrowBitmapReserve( + struct ArrowBitmap* bitmap, int64_t additional_size_bits); /// \brief Grow or shrink a bitmap to a given capacity /// @@ -1746,33 +1835,34 @@ static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, /// if shrink_to_fit is non-zero. Calling ArrowBitmapResize() does not /// adjust the buffer's size member except when shrinking new_capacity_bits /// to a value less than the current number of bits in the bitmap. -static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, - int64_t new_capacity_bits, - char shrink_to_fit); +static inline ArrowErrorCode ArrowBitmapResize( + struct ArrowBitmap* bitmap, int64_t new_capacity_bits, char shrink_to_fit); -/// \brief Reserve space for and append zero or more of the same boolean value to a bitmap -static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length); +/// \brief Reserve space for and append zero or more of the same boolean value +/// to a bitmap +static inline ArrowErrorCode ArrowBitmapAppend( + struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length); /// \brief Append zero or more of the same boolean value to a bitmap -static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length); +static inline void ArrowBitmapAppendUnsafe( + struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length); /// \brief Append boolean values encoded as int8_t to a bitmap /// /// The values must all be 0 or 1. -static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - const int8_t* values, int64_t n_values); +static inline void ArrowBitmapAppendInt8Unsafe( + struct ArrowBitmap* bitmap, const int8_t* values, int64_t n_values); /// \brief Append boolean values encoded as int32_t to a bitmap /// /// The values must all be 0 or 1. -static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, - const int32_t* values, int64_t n_values); +static inline void ArrowBitmapAppendInt32Unsafe( + struct ArrowBitmap* bitmap, const int32_t* values, int64_t n_values); /// \brief Reset a bitmap builder /// -/// Releases any memory held by buffer, empties the cache, and resets the size to zero +/// Releases any memory held by buffer, empties the cache, and resets the size +/// to zero static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap); /// @} @@ -1791,24 +1881,26 @@ static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap); /// Initializes the fields and release callback of array. Caller /// is responsible for calling the array->release callback if /// NANOARROW_OK is returned. -ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, - enum ArrowType storage_type); +ArrowErrorCode ArrowArrayInitFromType( + struct ArrowArray* array, enum ArrowType storage_type); /// \brief Initialize the contents of an ArrowArray from an ArrowSchema /// /// Caller is responsible for calling the array->release callback if /// NANOARROW_OK is returned. -ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, - const struct ArrowSchema* schema, - struct ArrowError* error); +ArrowErrorCode ArrowArrayInitFromSchema( + struct ArrowArray* array, + const struct ArrowSchema* schema, + struct ArrowError* error); /// \brief Initialize the contents of an ArrowArray from an ArrowArrayView /// /// Caller is responsible for calling the array->release callback if /// NANOARROW_OK is returned. -ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, - const struct ArrowArrayView* array_view, - struct ArrowError* error); +ArrowErrorCode ArrowArrayInitFromArrayView( + struct ArrowArray* array, + const struct ArrowArrayView* array_view, + struct ArrowError* error); /// \brief Allocate the array->children array /// @@ -1816,7 +1908,8 @@ ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, /// whose members are marked as released and may be subsequently initialized /// with ArrowArrayInitFromType() or moved from an existing ArrowArray. /// schema must have been allocated using ArrowArrayInitFromType(). -ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children); +ArrowErrorCode ArrowArrayAllocateChildren( + struct ArrowArray* array, int64_t n_children); /// \brief Allocate the array->dictionary member /// @@ -1829,30 +1922,33 @@ ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array); /// \brief Set the validity bitmap of an ArrowArray /// /// array must have been allocated using ArrowArrayInitFromType() -void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap); +void ArrowArraySetValidityBitmap( + struct ArrowArray* array, struct ArrowBitmap* bitmap); /// \brief Set a buffer of an ArrowArray /// /// array must have been allocated using ArrowArrayInitFromType() -ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, - struct ArrowBuffer* buffer); +ArrowErrorCode ArrowArraySetBuffer( + struct ArrowArray* array, int64_t i, struct ArrowBuffer* buffer); /// \brief Get the validity bitmap of an ArrowArray /// /// array must have been allocated using ArrowArrayInitFromType() -static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array); +static inline struct ArrowBitmap* ArrowArrayValidityBitmap( + struct ArrowArray* array); /// \brief Get a buffer of an ArrowArray /// /// array must have been allocated using ArrowArrayInitFromType() -static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i); +static inline struct ArrowBuffer* ArrowArrayBuffer( + struct ArrowArray* array, int64_t i); /// \brief Start element-wise appending to an ArrowArray /// /// Initializes any values needed to use ArrowArrayAppend*() functions. -/// All element-wise appenders append by value and return EINVAL if the exact value -/// cannot be represented by the underlying storage type. -/// array must have been allocated using ArrowArrayInitFromType() +/// All element-wise appenders append by value and return EINVAL if the exact +/// value cannot be represented by the underlying storage type. array must have +/// been allocated using ArrowArrayInitFromType() static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array); /// \brief Reserve space for future appends @@ -1861,29 +1957,32 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array); /// child array sizes for non-fixed-size arrays), recursively reserve space for /// additional elements. This is useful for reducing the number of reallocations /// that occur using the item-wise appenders. -ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, - int64_t additional_size_elements); +ArrowErrorCode ArrowArrayReserve( + struct ArrowArray* array, int64_t additional_size_elements); /// \brief Append a null value to an array -static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n); +static inline ArrowErrorCode ArrowArrayAppendNull( + struct ArrowArray* array, int64_t n); /// \brief Append an empty, non-null value to an array -static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n); +static inline ArrowErrorCode ArrowArrayAppendEmpty( + struct ArrowArray* array, int64_t n); /// \brief Append a signed integer value to an array /// /// Returns NANOARROW_OK if value can be exactly represented by /// the underlying storage type or EINVAL otherwise (e.g., value /// is outside the valid array range). -static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, int64_t value); +static inline ArrowErrorCode ArrowArrayAppendInt( + struct ArrowArray* array, int64_t value); /// \brief Append an unsigned integer value to an array /// /// Returns NANOARROW_OK if value can be exactly represented by /// the underlying storage type or EINVAL otherwise (e.g., value /// is outside the valid array range). -static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, - uint64_t value); +static inline ArrowErrorCode ArrowArrayAppendUInt( + struct ArrowArray* array, uint64_t value); /// \brief Append a double value to an array /// @@ -1891,67 +1990,68 @@ static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, /// the underlying storage type or EINVAL otherwise (e.g., value /// is outside the valid array range or there is an attempt to append /// a non-integer to an array with an integer storage type). -static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, - double value); +static inline ArrowErrorCode ArrowArrayAppendDouble( + struct ArrowArray* array, double value); /// \brief Append a string of bytes to an array /// /// Returns NANOARROW_OK if value can be exactly represented by /// the underlying storage type, EOVERFLOW if appending value would overflow /// the offset type (e.g., if the data buffer would be larger than 2 GB for a -/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a -/// binary, string, large binary, large string, or fixed-size binary array, or value is -/// the wrong size for a fixed-size binary array). -static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, - struct ArrowBufferView value); +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is +/// not a binary, string, large binary, large string, or fixed-size binary +/// array, or value is the wrong size for a fixed-size binary array). +static inline ArrowErrorCode ArrowArrayAppendBytes( + struct ArrowArray* array, struct ArrowBufferView value); /// \brief Append a string value to an array /// /// Returns NANOARROW_OK if value can be exactly represented by /// the underlying storage type, EOVERFLOW if appending value would overflow /// the offset type (e.g., if the data buffer would be larger than 2 GB for a -/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a -/// string or large string array). -static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, - struct ArrowStringView value); +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is +/// not a string or large string array). +static inline ArrowErrorCode ArrowArrayAppendString( + struct ArrowArray* array, struct ArrowStringView value); /// \brief Append a Interval to an array /// /// Returns NANOARROW_OK if value can be exactly represented by /// the underlying storage type or EINVAL otherwise. -static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, - const struct ArrowInterval* value); +static inline ArrowErrorCode ArrowArrayAppendInterval( + struct ArrowArray* array, const struct ArrowInterval* value); /// \brief Append a decimal value to an array /// /// Returns NANOARROW_OK if array is a decimal array with the appropriate /// bitwidth or EINVAL otherwise. -static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, - const struct ArrowDecimal* value); +static inline ArrowErrorCode ArrowArrayAppendDecimal( + struct ArrowArray* array, const struct ArrowDecimal* value); /// \brief Finish a nested array element /// /// Appends a non-null element to the array based on the first child's current /// length. Returns NANOARROW_OK if the item was successfully added, EOVERFLOW /// if the child of a list or map array would exceed INT_MAX elements, or EINVAL -/// if the underlying storage type is not a struct, list, large list, or fixed-size -/// list, or if there was an attempt to add a struct or fixed-size list element where the -/// length of the child array(s) did not match the expected length. +/// if the underlying storage type is not a struct, list, large list, or +/// fixed-size list, or if there was an attempt to add a struct or fixed-size +/// list element where the length of the child array(s) did not match the +/// expected length. static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array); /// \brief Finish a union array element /// -/// Appends an element to the union type ids buffer and increments array->length. -/// For sparse unions, up to one element is added to non type-id children. Returns -/// EINVAL if the underlying storage type is not a union, if type_id is not valid, -/// or if child sizes after appending are inconsistent. -static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, - int8_t type_id); +/// Appends an element to the union type ids buffer and increments +/// array->length. For sparse unions, up to one element is added to non type-id +/// children. Returns EINVAL if the underlying storage type is not a union, if +/// type_id is not valid, or if child sizes after appending are inconsistent. +static inline ArrowErrorCode ArrowArrayFinishUnionElement( + struct ArrowArray* array, int8_t type_id); /// \brief Shrink buffer capacity to the size required /// -/// Also applies shrinking to any child arrays. array must have been allocated using -/// ArrowArrayInitFromType +/// Also applies shrinking to any child arrays. array must have been allocated +/// using ArrowArrayInitFromType static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array); /// \brief Finish building an ArrowArray @@ -1960,19 +2060,20 @@ static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array); /// into array->buffers and checks the actual size of the buffers /// against the expected size based on the final length. /// array must have been allocated using ArrowArrayInitFromType() -ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, - struct ArrowError* error); +ArrowErrorCode ArrowArrayFinishBuildingDefault( + struct ArrowArray* array, struct ArrowError* error); /// \brief Finish building an ArrowArray with explicit validation /// -/// Finish building with an explicit validation level. This could perform less validation -/// (i.e. NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU -/// buffer data access is not possible or more validation (i.e., -/// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or -/// corruptible source. -ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, - enum ArrowValidationLevel validation_level, - struct ArrowError* error); +/// Finish building with an explicit validation level. This could perform less +/// validation (i.e. NANOARROW_VALIDATION_LEVEL_NONE or +/// NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU buffer data access is not +/// possible or more validation (i.e., NANOARROW_VALIDATION_LEVEL_FULL) if +/// buffer content was obtained from an untrusted or corruptible source. +ArrowErrorCode ArrowArrayFinishBuilding( + struct ArrowArray* array, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); /// @} @@ -1983,66 +2084,71 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, /// @{ /// \brief Initialize the contents of an ArrowArrayView -void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, - enum ArrowType storage_type); +void ArrowArrayViewInitFromType( + struct ArrowArrayView* array_view, enum ArrowType storage_type); /// \brief Move an ArrowArrayView /// /// Transfers the ArrowArrayView data and lifecycle management to another /// address and resets the contents of src. -static inline void ArrowArrayViewMove(struct ArrowArrayView* src, - struct ArrowArrayView* dst); +static inline void ArrowArrayViewMove( + struct ArrowArrayView* src, struct ArrowArrayView* dst); /// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema -ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, - const struct ArrowSchema* schema, - struct ArrowError* error); +ArrowErrorCode ArrowArrayViewInitFromSchema( + struct ArrowArrayView* array_view, + const struct ArrowSchema* schema, + struct ArrowError* error); /// \brief Allocate the array_view->children array /// /// Includes the memory for each child struct ArrowArrayView -ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, - int64_t n_children); +ArrowErrorCode ArrowArrayViewAllocateChildren( + struct ArrowArrayView* array_view, int64_t n_children); /// \brief Allocate array_view->dictionary -ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view); +ArrowErrorCode ArrowArrayViewAllocateDictionary( + struct ArrowArrayView* array_view); /// \brief Set data-independent buffer sizes from length void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); /// \brief Set buffer sizes and data pointers from an ArrowArray -ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, - const struct ArrowArray* array, - struct ArrowError* error); +ArrowErrorCode ArrowArrayViewSetArray( + struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error); -/// \brief Set buffer sizes and data pointers from an ArrowArray except for those -/// that require dereferencing buffer content. -ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, - const struct ArrowArray* array, - struct ArrowError* error); +/// \brief Set buffer sizes and data pointers from an ArrowArray except for +/// those that require dereferencing buffer content. +ArrowErrorCode ArrowArrayViewSetArrayMinimal( + struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error); /// \brief Performs checks on the content of an ArrowArrayView /// /// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray, /// the buffer sizes and some content (fist and last offset) have already /// been validated at the "default" level. If setting the buffer pointers -/// and sizes otherwise, you may wish to perform checks at a different level. See -/// documentation for ArrowValidationLevel for the details of checks performed -/// at each level. -ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, - enum ArrowValidationLevel validation_level, - struct ArrowError* error); +/// and sizes otherwise, you may wish to perform checks at a different level. +/// See documentation for ArrowValidationLevel for the details of checks +/// performed at each level. +ArrowErrorCode ArrowArrayViewValidate( + struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); /// \brief Reset the contents of an ArrowArrayView and frees resources void ArrowArrayViewReset(struct ArrowArrayView* array_view); /// \brief Check for a null element in an ArrowArrayView -static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, - int64_t i); +static inline int8_t ArrowArrayViewIsNull( + const struct ArrowArrayView* array_view, int64_t i); /// \brief Get the type id of a union array element -static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, - int64_t i); +static inline int8_t ArrowArrayViewUnionTypeId( + const struct ArrowArrayView* array_view, int64_t i); /// \brief Get the child index of a union array element static inline int8_t ArrowArrayViewUnionChildIndex( @@ -2054,15 +2160,15 @@ static inline int64_t ArrowArrayViewUnionChildOffset( /// \brief Get an element in an ArrowArrayView as an integer /// -/// This function does not check for null values, that values are actually integers, or -/// that values are within a valid range for an int64. -static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, - int64_t i); +/// This function does not check for null values, that values are actually +/// integers, or that values are within a valid range for an int64. +static inline int64_t ArrowArrayViewGetIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an unsigned integer /// -/// This function does not check for null values, that values are actually integers, or -/// that values are within a valid range for a uint64. +/// This function does not check for null values, that values are actually +/// integers, or that values are within a valid range for a uint64. static inline uint64_t ArrowArrayViewGetUIntUnsafe( const struct ArrowArrayView* array_view, int64_t i); @@ -2090,8 +2196,10 @@ static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( /// This function does not check for null values. The out parameter must /// be initialized with ArrowDecimalInit() with the proper parameters for this /// type before calling this for the first time. -static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, - int64_t i, struct ArrowDecimal* out); +static inline void ArrowArrayViewGetDecimalUnsafe( + const struct ArrowArrayView* array_view, + int64_t i, + struct ArrowDecimal* out); /// @} @@ -2109,8 +2217,10 @@ static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* a /// This function moves the ownership of schema to the array_stream. If /// this function returns NANOARROW_OK, the caller is responsible for /// releasing the ArrowArrayStream. -ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, - struct ArrowSchema* schema, int64_t n_arrays); +ArrowErrorCode ArrowBasicArrayStreamInit( + struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema, + int64_t n_arrays); /// \brief Set the ith ArrowArray in this ArrowArrayStream. /// @@ -2119,29 +2229,27 @@ ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, /// be greater than zero and less than the value of n_arrays passed in /// ArrowBasicArrayStreamInit(). Callers are not required to fill all /// n_arrays members (i.e., n_arrays is a maximum bound). -void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, - struct ArrowArray* array); +void ArrowBasicArrayStreamSetArray( + struct ArrowArrayStream* array_stream, int64_t i, struct ArrowArray* array); /// \brief Validate the contents of this ArrowArrayStream /// /// array_stream must have been initialized with ArrowBasicArrayStreamInit(). -/// This function uses ArrowArrayStreamInitFromSchema() and ArrowArrayStreamSetArray() -/// to validate the contents of the arrays. -ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, - struct ArrowError* error); +/// This function uses ArrowArrayStreamInitFromSchema() and +/// ArrowArrayStreamSetArray() to validate the contents of the arrays. +ArrowErrorCode ArrowBasicArrayStreamValidate( + const struct ArrowArrayStream* array_stream, struct ArrowError* error); /// @} -// Undefine ArrowErrorCode, which may have been defined to annotate functions that return -// it to warn for an unused result. +// Undefine ArrowErrorCode, which may have been defined to annotate functions +// that return it to warn for an unused result. #if defined(ArrowErrorCode) #undef ArrowErrorCode #endif // Inline function definitions - - #ifdef __cplusplus } #endif @@ -2171,574 +2279,599 @@ ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* arra #include #include - - #ifdef __cplusplus extern "C" { #endif -static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) { - int64_t doubled_capacity = current_capacity * 2; - if (doubled_capacity > new_capacity) { - return doubled_capacity; - } else { - return new_capacity; - } +static inline int64_t _ArrowGrowByFactor( + int64_t current_capacity, int64_t new_capacity) { + int64_t doubled_capacity = current_capacity * 2; + if (doubled_capacity > new_capacity) { + return doubled_capacity; + } else { + return new_capacity; + } } static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { - buffer->data = NULL; - buffer->size_bytes = 0; - buffer->capacity_bytes = 0; - buffer->allocator = ArrowBufferAllocatorDefault(); + buffer->data = NULL; + buffer->size_bytes = 0; + buffer->capacity_bytes = 0; + buffer->allocator = ArrowBufferAllocatorDefault(); } static inline ArrowErrorCode ArrowBufferSetAllocator( struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) { - if (buffer->data == NULL) { - buffer->allocator = allocator; - return NANOARROW_OK; - } else { - return EINVAL; - } + if (buffer->data == NULL) { + buffer->allocator = allocator; + return NANOARROW_OK; + } else { + return EINVAL; + } } static inline void ArrowBufferReset(struct ArrowBuffer* buffer) { - if (buffer->data != NULL) { - buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, - buffer->capacity_bytes); - buffer->data = NULL; - } + if (buffer->data != NULL) { + buffer->allocator.free( + &buffer->allocator, (uint8_t*)buffer->data, buffer->capacity_bytes); + buffer->data = NULL; + } - buffer->capacity_bytes = 0; - buffer->size_bytes = 0; + buffer->capacity_bytes = 0; + buffer->size_bytes = 0; } -static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) { - memcpy(dst, src, sizeof(struct ArrowBuffer)); - src->data = NULL; - ArrowBufferReset(src); +static inline void ArrowBufferMove( + struct ArrowBuffer* src, struct ArrowBuffer* dst) { + memcpy(dst, src, sizeof(struct ArrowBuffer)); + src->data = NULL; + ArrowBufferReset(src); } -static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, - int64_t new_capacity_bytes, - char shrink_to_fit) { - if (new_capacity_bytes < 0) { - return EINVAL; - } - - if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) { - buffer->data = buffer->allocator.reallocate( - &buffer->allocator, buffer->data, buffer->capacity_bytes, new_capacity_bytes); - if (buffer->data == NULL && new_capacity_bytes > 0) { - buffer->capacity_bytes = 0; - buffer->size_bytes = 0; - return ENOMEM; +static inline ArrowErrorCode ArrowBufferResize( + struct ArrowBuffer* buffer, + int64_t new_capacity_bytes, + char shrink_to_fit) { + if (new_capacity_bytes < 0) { + return EINVAL; } - buffer->capacity_bytes = new_capacity_bytes; - } + if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) { + buffer->data = buffer->allocator.reallocate( + &buffer->allocator, + buffer->data, + buffer->capacity_bytes, + new_capacity_bytes); + if (buffer->data == NULL && new_capacity_bytes > 0) { + buffer->capacity_bytes = 0; + buffer->size_bytes = 0; + return ENOMEM; + } - // Ensures that when shrinking that size <= capacity - if (new_capacity_bytes < buffer->size_bytes) { - buffer->size_bytes = new_capacity_bytes; - } + buffer->capacity_bytes = new_capacity_bytes; + } - return NANOARROW_OK; -} + // Ensures that when shrinking that size <= capacity + if (new_capacity_bytes < buffer->size_bytes) { + buffer->size_bytes = new_capacity_bytes; + } -static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, - int64_t additional_size_bytes) { - int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes; - if (min_capacity_bytes <= buffer->capacity_bytes) { return NANOARROW_OK; - } +} + +static inline ArrowErrorCode ArrowBufferReserve( + struct ArrowBuffer* buffer, int64_t additional_size_bytes) { + int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes; + if (min_capacity_bytes <= buffer->capacity_bytes) { + return NANOARROW_OK; + } - return ArrowBufferResize( - buffer, _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), 0); + return ArrowBufferResize( + buffer, + _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), + 0); } -static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, - int64_t size_bytes) { - if (size_bytes > 0) { - memcpy(buffer->data + buffer->size_bytes, data, size_bytes); - buffer->size_bytes += size_bytes; - } +static inline void ArrowBufferAppendUnsafe( + struct ArrowBuffer* buffer, const void* data, int64_t size_bytes) { + if (size_bytes > 0) { + memcpy(buffer->data + buffer->size_bytes, data, size_bytes); + buffer->size_bytes += size_bytes; + } } -static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, - const void* data, int64_t size_bytes) { - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); +static inline ArrowErrorCode ArrowBufferAppend( + struct ArrowBuffer* buffer, const void* data, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); - ArrowBufferAppendUnsafe(buffer, data, size_bytes); - return NANOARROW_OK; + ArrowBufferAppendUnsafe(buffer, data, size_bytes); + return NANOARROW_OK; } -static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, - int8_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int8_t)); +static inline ArrowErrorCode ArrowBufferAppendInt8( + struct ArrowBuffer* buffer, int8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int8_t)); } -static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, - uint8_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint8_t)); +static inline ArrowErrorCode ArrowBufferAppendUInt8( + struct ArrowBuffer* buffer, uint8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint8_t)); } -static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, - int16_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int16_t)); +static inline ArrowErrorCode ArrowBufferAppendInt16( + struct ArrowBuffer* buffer, int16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int16_t)); } -static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, - uint16_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); +static inline ArrowErrorCode ArrowBufferAppendUInt16( + struct ArrowBuffer* buffer, uint16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); } -static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, - int32_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int32_t)); +static inline ArrowErrorCode ArrowBufferAppendInt32( + struct ArrowBuffer* buffer, int32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int32_t)); } -static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, - uint32_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); +static inline ArrowErrorCode ArrowBufferAppendUInt32( + struct ArrowBuffer* buffer, uint32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); } -static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, - int64_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); +static inline ArrowErrorCode ArrowBufferAppendInt64( + struct ArrowBuffer* buffer, int64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); } -static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, - uint64_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); +static inline ArrowErrorCode ArrowBufferAppendUInt64( + struct ArrowBuffer* buffer, uint64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); } -static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, - double value) { - return ArrowBufferAppend(buffer, &value, sizeof(double)); +static inline ArrowErrorCode ArrowBufferAppendDouble( + struct ArrowBuffer* buffer, double value) { + return ArrowBufferAppend(buffer, &value, sizeof(double)); } -static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, - float value) { - return ArrowBufferAppend(buffer, &value, sizeof(float)); +static inline ArrowErrorCode ArrowBufferAppendFloat( + struct ArrowBuffer* buffer, float value) { + return ArrowBufferAppend(buffer, &value, sizeof(float)); } -static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, - struct ArrowStringView value) { - return ArrowBufferAppend(buffer, value.data, value.size_bytes); +static inline ArrowErrorCode ArrowBufferAppendStringView( + struct ArrowBuffer* buffer, struct ArrowStringView value) { + return ArrowBufferAppend(buffer, value.data, value.size_bytes); } -static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, - struct ArrowBufferView value) { - return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); +static inline ArrowErrorCode ArrowBufferAppendBufferView( + struct ArrowBuffer* buffer, struct ArrowBufferView value) { + return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); } -static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, - uint8_t value, int64_t size_bytes) { - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); +static inline ArrowErrorCode ArrowBufferAppendFill( + struct ArrowBuffer* buffer, uint8_t value, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); - memset(buffer->data + buffer->size_bytes, value, size_bytes); - buffer->size_bytes += size_bytes; - return NANOARROW_OK; + memset(buffer->data + buffer->size_bytes, value, size_bytes); + buffer->size_bytes += size_bytes; + return NANOARROW_OK; } static const uint8_t _ArrowkBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; -static const uint8_t _ArrowkFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; +static const uint8_t _ArrowkFlippedBitmask[] = { + 254, 253, 251, 247, 239, 223, 191, 127}; static const uint8_t _ArrowkPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; -static const uint8_t _ArrowkTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; +static const uint8_t _ArrowkTrailingBitmask[] = { + 255, 254, 252, 248, 240, 224, 192, 128}; static const uint8_t _ArrowkBytePopcount[] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, - 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, - 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, - 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, - 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, - 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, - 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, - 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; static inline int64_t _ArrowRoundUpToMultipleOf8(int64_t value) { - return (value + 7) & ~((int64_t)7); + return (value + 7) & ~((int64_t)7); } static inline int64_t _ArrowRoundDownToMultipleOf8(int64_t value) { - return (value / 8) * 8; + return (value / 8) * 8; } static inline int64_t _ArrowBytesForBits(int64_t bits) { - return (bits >> 3) + ((bits & 7) != 0); + return (bits >> 3) + ((bits & 7) != 0); } static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { - out[0] = (word & 0x1) != 0; - out[1] = (word & 0x2) != 0; - out[2] = (word & 0x4) != 0; - out[3] = (word & 0x8) != 0; - out[4] = (word & 0x10) != 0; - out[5] = (word & 0x20) != 0; - out[6] = (word & 0x40) != 0; - out[7] = (word & 0x80) != 0; + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; } static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { - out[0] = (word & 0x1) != 0; - out[1] = (word & 0x2) != 0; - out[2] = (word & 0x4) != 0; - out[3] = (word & 0x8) != 0; - out[4] = (word & 0x10) != 0; - out[5] = (word & 0x20) != 0; - out[6] = (word & 0x40) != 0; - out[7] = (word & 0x80) != 0; + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; } static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { - *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | - ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | - ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | - ((values[7] + 0x7f) & 0x80)); + *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | + ((values[2] + 0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | + ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | + ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); } static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { - *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | - ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | - ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | - ((values[7] + 0x7f) & 0x80)); + *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | + ((values[2] + 0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | + ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | + ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); } static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { - return (bits[i >> 3] >> (i & 0x07)) & 1; + return (bits[i >> 3] >> (i & 0x07)) & 1; } -static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, - int64_t length, int8_t* out) { - if (length == 0) { - return; - } +static inline void ArrowBitsUnpackInt8( + const uint8_t* bits, int64_t start_offset, int64_t length, int8_t* out) { + if (length == 0) { + return; + } - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - const int64_t i_last_valid = i_end - 1; + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_last_valid = i_last_valid / 8; + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; - if (bytes_begin == bytes_last_valid) { - for (int i = 0; i < length; i++) { - out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); - } + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } - return; - } + return; + } - // first byte - for (int i = 0; i < 8 - (i_begin % 8); i++) { - *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); - } + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } - // middle bytes - for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { - _ArrowBitsUnpackInt8(bits[i], out); - out += 8; - } + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt8(bits[i], out); + out += 8; + } - // last byte - const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); - for (int i = 0; i < bits_remaining; i++) { - *out++ = ArrowBitGet(&bits[bytes_last_valid], i); - } + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } } -static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, - int64_t length, int32_t* out) { - if (length == 0) { - return; - } +static inline void ArrowBitsUnpackInt32( + const uint8_t* bits, int64_t start_offset, int64_t length, int32_t* out) { + if (length == 0) { + return; + } - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - const int64_t i_last_valid = i_end - 1; + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_last_valid = i_last_valid / 8; + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; - if (bytes_begin == bytes_last_valid) { - for (int i = 0; i < length; i++) { - out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); - } + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } - return; - } + return; + } - // first byte - for (int i = 0; i < 8 - (i_begin % 8); i++) { - *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); - } + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } - // middle bytes - for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { - _ArrowBitsUnpackInt32(bits[i], out); - out += 8; - } + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt32(bits[i], out); + out += 8; + } - // last byte - const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); - for (int i = 0; i < bits_remaining; i++) { - *out++ = ArrowBitGet(&bits[bytes_last_valid], i); - } + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } } static inline void ArrowBitSet(uint8_t* bits, int64_t i) { - bits[i / 8] |= _ArrowkBitmask[i % 8]; + bits[i / 8] |= _ArrowkBitmask[i % 8]; } static inline void ArrowBitClear(uint8_t* bits, int64_t i) { - bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; + bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; } static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { - bits[i / 8] ^= - ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & _ArrowkBitmask[i % 8]; -} - -static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, - uint8_t bits_are_set) { - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - const uint8_t fill_byte = (uint8_t)(-bits_are_set); - - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_end = i_end / 8 + 1; - - const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; - const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; - - if (bytes_end == bytes_begin + 1) { - // set bits within a single byte - const uint8_t only_byte_mask = - i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); - bits[bytes_begin] &= only_byte_mask; - bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); - return; - } + bits[i / 8] ^= ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & + _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitsSetTo( + uint8_t* bits, int64_t start_offset, int64_t length, uint8_t bits_are_set) { + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const uint8_t fill_byte = (uint8_t)(-bits_are_set); + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_end = i_end / 8 + 1; + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; + + if (bytes_end == bytes_begin + 1) { + // set bits within a single byte + const uint8_t only_byte_mask = i_end % 8 == 0 ? + first_byte_mask : + (uint8_t)(first_byte_mask | + last_byte_mask); + bits[bytes_begin] &= only_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } - // set/clear trailing bits of first byte - bits[bytes_begin] &= first_byte_mask; - bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + // set/clear trailing bits of first byte + bits[bytes_begin] &= first_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); - if (bytes_end - bytes_begin > 2) { - // set/clear whole bytes - memset(bits + bytes_begin + 1, fill_byte, (size_t)(bytes_end - bytes_begin - 2)); - } + if (bytes_end - bytes_begin > 2) { + // set/clear whole bytes + memset( + bits + bytes_begin + 1, + fill_byte, + (size_t)(bytes_end - bytes_begin - 2)); + } - if (i_end % 8 == 0) { - return; - } + if (i_end % 8 == 0) { + return; + } - // set/clear leading bits of last byte - bits[bytes_end - 1] &= last_byte_mask; - bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); + // set/clear leading bits of last byte + bits[bytes_end - 1] &= last_byte_mask; + bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); } -static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset, - int64_t length) { - if (length == 0) { - return 0; - } +static inline int64_t ArrowBitCountSet( + const uint8_t* bits, int64_t start_offset, int64_t length) { + if (length == 0) { + return 0; + } - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - const int64_t i_last_valid = i_end - 1; + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_last_valid = i_last_valid / 8; + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; - if (bytes_begin == bytes_last_valid) { - // count bits within a single byte - const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; - const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; + if (bytes_begin == bytes_last_valid) { + // count bits within a single byte + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; - const uint8_t only_byte_mask = - i_end % 8 == 0 ? last_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); + const uint8_t only_byte_mask = i_end % 8 == 0 ? + last_byte_mask : + (uint8_t)(first_byte_mask & + last_byte_mask); - const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; - return _ArrowkBytePopcount[byte_masked]; - } + const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; + return _ArrowkBytePopcount[byte_masked]; + } - const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; - const uint8_t last_byte_mask = i_end % 8 == 0 ? 0 : _ArrowkTrailingBitmask[i_end % 8]; - int64_t count = 0; + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = i_end % 8 == 0 ? + 0 : + _ArrowkTrailingBitmask[i_end % 8]; + int64_t count = 0; - // first byte - count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; + // first byte + count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; - // middle bytes - for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { - count += _ArrowkBytePopcount[bits[i]]; - } + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + count += _ArrowkBytePopcount[bits[i]]; + } - // last byte - count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; + // last byte + count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; - return count; + return count; } static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap) { - ArrowBufferInit(&bitmap->buffer); - bitmap->size_bits = 0; + ArrowBufferInit(&bitmap->buffer); + bitmap->size_bits = 0; } -static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst) { - ArrowBufferMove(&src->buffer, &dst->buffer); - dst->size_bits = src->size_bits; - src->size_bits = 0; +static inline void ArrowBitmapMove( + struct ArrowBitmap* src, struct ArrowBitmap* dst) { + ArrowBufferMove(&src->buffer, &dst->buffer); + dst->size_bits = src->size_bits; + src->size_bits = 0; } -static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, - int64_t additional_size_bits) { - int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; - if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { - return NANOARROW_OK; - } +static inline ArrowErrorCode ArrowBitmapReserve( + struct ArrowBitmap* bitmap, int64_t additional_size_bits) { + int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; + if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { + return NANOARROW_OK; + } - NANOARROW_RETURN_NOT_OK( - ArrowBufferReserve(&bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve( + &bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); - bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; - return NANOARROW_OK; + bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; + return NANOARROW_OK; } -static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, - int64_t new_capacity_bits, - char shrink_to_fit) { - if (new_capacity_bits < 0) { - return EINVAL; - } +static inline ArrowErrorCode ArrowBitmapResize( + struct ArrowBitmap* bitmap, int64_t new_capacity_bits, char shrink_to_fit) { + if (new_capacity_bits < 0) { + return EINVAL; + } - int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); - NANOARROW_RETURN_NOT_OK( - ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); + int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); + NANOARROW_RETURN_NOT_OK( + ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); - if (new_capacity_bits < bitmap->size_bits) { - bitmap->size_bits = new_capacity_bits; - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); - - ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); - return NANOARROW_OK; -} + if (new_capacity_bits < bitmap->size_bits) { + bitmap->size_bits = new_capacity_bits; + } -static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length) { - ArrowBitsSetTo(bitmap->buffer.data, bitmap->size_bits, length, bits_are_set); - bitmap->size_bits += length; - bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); + return NANOARROW_OK; } -static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - const int8_t* values, int64_t n_values) { - if (n_values == 0) { - return; - } +static inline ArrowErrorCode ArrowBitmapAppend( + struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); - const int8_t* values_cursor = values; - int64_t n_remaining = n_values; - int64_t out_i_cursor = bitmap->size_bits; - uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); + return NANOARROW_OK; +} + +static inline void ArrowBitmapAppendUnsafe( + struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length) { + ArrowBitsSetTo( + bitmap->buffer.data, bitmap->size_bits, length, bits_are_set); + bitmap->size_bits += length; + bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); +} - // First byte - if ((out_i_cursor % 8) != 0) { - int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; - for (int i = 0; i < n_partial_bits; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); +static inline void ArrowBitmapAppendInt8Unsafe( + struct ArrowBitmap* bitmap, const int8_t* values, int64_t n_values) { + if (n_values == 0) { + return; } - out_cursor++; - values_cursor += n_partial_bits; - n_remaining -= n_partial_bits; - } + const int8_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - + out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); + } - // Middle bytes - int64_t n_full_bytes = n_remaining / 8; - for (int64_t i = 0; i < n_full_bytes; i++) { - _ArrowBitmapPackInt8(values_cursor, out_cursor); - values_cursor += 8; - out_cursor++; - } + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } - // Last byte - out_i_cursor += n_full_bytes * 8; - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte - *out_cursor = 0x00; - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt8(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; } - out_cursor++; - } - bitmap->size_bits += n_values; - bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo( + bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; } -static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, - const int32_t* values, int64_t n_values) { - if (n_values == 0) { - return; - } +static inline void ArrowBitmapAppendInt32Unsafe( + struct ArrowBitmap* bitmap, const int32_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } - const int32_t* values_cursor = values; - int64_t n_remaining = n_values; - int64_t out_i_cursor = bitmap->size_bits; - uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + const int32_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - + out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo( + bitmap->buffer.data, out_i_cursor++, (uint8_t)values[i]); + } - // First byte - if ((out_i_cursor % 8) != 0) { - int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; - for (int i = 0; i < n_partial_bits; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values[i]); + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; } - out_cursor++; - values_cursor += n_partial_bits; - n_remaining -= n_partial_bits; - } - - // Middle bytes - int64_t n_full_bytes = n_remaining / 8; - for (int64_t i = 0; i < n_full_bytes; i++) { - _ArrowBitmapPackInt32(values_cursor, out_cursor); - values_cursor += 8; - out_cursor++; - } + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt32(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } - // Last byte - out_i_cursor += n_full_bytes * 8; - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte - *out_cursor = 0x00; - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values_cursor[i]); - } - out_cursor++; - } - - bitmap->size_bits += n_values; - bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo( + bitmap->buffer.data, out_i_cursor++, (uint8_t)values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; } static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { - ArrowBufferReset(&bitmap->buffer); - bitmap->size_bits = 0; + ArrowBufferReset(&bitmap->buffer); + bitmap->size_bits = 0; } #ifdef __cplusplus @@ -2772,961 +2905,1026 @@ static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { #include #include - - - #ifdef __cplusplus extern "C" { #endif -static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - return &private_data->bitmap; -} - -static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - switch (i) { - case 0: - return &private_data->bitmap.buffer; - default: - return private_data->buffers + i - 1; - } +static inline struct ArrowBitmap* ArrowArrayValidityBitmap( + struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + return &private_data->bitmap; +} + +static inline struct ArrowBuffer* ArrowArrayBuffer( + struct ArrowArray* array, int64_t i) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + switch (i) { + case 0: + return &private_data->bitmap.buffer; + default: + return private_data->buffers + i - 1; + } } // We don't currently support the case of unions where type_id != child_index; // however, these functions are used to keep track of where that assumption // is made. -static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array, - int8_t type_id) { - NANOARROW_UNUSED(array); - return type_id; +static inline int8_t _ArrowArrayUnionChildIndex( + struct ArrowArray* array, int8_t type_id) { + NANOARROW_UNUSED(array); + return type_id; } -static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array, - int8_t child_index) { - NANOARROW_UNUSED(array); - return child_index; +static inline int8_t _ArrowArrayUnionTypeId( + struct ArrowArray* array, int8_t child_index) { + NANOARROW_UNUSED(array); + return child_index; } -static inline int32_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { - if (*type_ids == '\0') { - return 0; - } - - int32_t i = 0; - long type_id; - char* end_ptr; - do { - type_id = strtol(type_ids, &end_ptr, 10); - if (end_ptr == type_ids || type_id < 0 || type_id > 127) { - return -1; +static inline int32_t _ArrowParseUnionTypeIds( + const char* type_ids, int8_t* out) { + if (*type_ids == '\0') { + return 0; } - if (out != NULL) { - out[i] = (int8_t)type_id; + int32_t i = 0; + long type_id; + char* end_ptr; + do { + type_id = strtol(type_ids, &end_ptr, 10); + if (end_ptr == type_ids || type_id < 0 || type_id > 127) { + return -1; + } + + if (out != NULL) { + out[i] = (int8_t)type_id; + } + + i++; + + type_ids = end_ptr; + if (*type_ids == '\0') { + return i; + } else if (*type_ids != ',') { + return -1; + } else { + type_ids++; + } + } while (1); + + return -1; +} + +static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices( + const int8_t* type_ids, int64_t n_type_ids, int64_t n_children) { + if (n_type_ids != n_children) { + return 0; } - i++; + for (int8_t i = 0; i < n_type_ids; i++) { + if (type_ids[i] != i) { + return 0; + } + } - type_ids = end_ptr; - if (*type_ids == '\0') { - return i; - } else if (*type_ids != ',') { - return -1; - } else { - type_ids++; + return 1; +} + +static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices( + const char* type_id_str, int64_t n_children) { + int8_t type_ids[128]; + int32_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); + return _ArrowParsedUnionTypeIdsWillEqualChildIndices( + type_ids, n_type_ids, n_children); +} + +static inline ArrowErrorCode ArrowArrayStartAppending( + struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + return EINVAL; + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + // Note that this value could be -1 if the type_ids string was + // invalid + if (private_data->union_type_id_is_child_index != 1) { + return EINVAL; + } else { + break; + } + default: + break; + } + if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { + return EINVAL; } - } while (1); - return -1; -} + // Initialize any data offset buffer with a single zero + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + if (private_data->layout.buffer_type[i] == + NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 64) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); + } else if ( + private_data->layout.buffer_type[i] == + NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 32) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); + } + } -static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const int8_t* type_ids, - int64_t n_type_ids, - int64_t n_children) { - if (n_type_ids != n_children) { - return 0; - } + // Start building any child arrays or dictionaries + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); + } - for (int8_t i = 0; i < n_type_ids; i++) { - if (type_ids[i] != i) { - return 0; + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); } - } - return 1; + return NANOARROW_OK; } -static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str, - int64_t n_children) { - int8_t type_ids[128]; - int32_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); - return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, n_children); -} +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + NANOARROW_RETURN_NOT_OK( + ArrowBufferResize(buffer, buffer->size_bytes, 1)); + } -static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); + } - switch (private_data->storage_type) { - case NANOARROW_TYPE_UNINITIALIZED: - return EINVAL; - case NANOARROW_TYPE_SPARSE_UNION: - case NANOARROW_TYPE_DENSE_UNION: - // Note that this value could be -1 if the type_ids string was invalid - if (private_data->union_type_id_is_child_index != 1) { - return EINVAL; - } else { - break; - } - default: - break; - } - if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { - return EINVAL; - } - - // Initialize any data offset buffer with a single zero - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && - private_data->layout.element_size_bits[i] == 64) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); - } else if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && - private_data->layout.element_size_bits[i] == 32) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); - } - } - - // Start building any child arrays or dictionaries - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); - } - - if (array->dictionary != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); - } - - return NANOARROW_OK; + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); + } + + return NANOARROW_OK; } -static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { - for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); - NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); - } - - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); - } - - if (array->dictionary != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode _ArrowArrayAppendBits(struct ArrowArray* array, - int64_t buffer_i, uint8_t value, - int64_t n) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); - int64_t bytes_required = - _ArrowRoundUpToMultipleOf8(private_data->layout.element_size_bits[buffer_i] * +static inline ArrowErrorCode _ArrowArrayAppendBits( + struct ArrowArray* array, int64_t buffer_i, uint8_t value, int64_t n) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); + int64_t bytes_required = _ArrowRoundUpToMultipleOf8( + private_data->layout + .element_size_bits[buffer_i] * (array->length + 1)) / - 8; - if (bytes_required > buffer->size_bytes) { - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendFill(buffer, 0, bytes_required - buffer->size_bytes)); - } + 8; + if (bytes_required > buffer->size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill( + buffer, 0, bytes_required - buffer->size_bytes)); + } - ArrowBitsSetTo(buffer->data, array->length, n, value); - return NANOARROW_OK; + ArrowBitsSetTo(buffer->data, array->length, n, value); + return NANOARROW_OK; } -static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* array, - int64_t n, uint8_t is_valid) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; +static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal( + struct ArrowArray* array, int64_t n, uint8_t is_valid) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; - if (n == 0) { - return NANOARROW_OK; - } - - // Some type-specific handling - switch (private_data->storage_type) { - case NANOARROW_TYPE_NA: - // (An empty value for a null array *is* a null) - array->null_count += n; - array->length += n; - return NANOARROW_OK; - - case NANOARROW_TYPE_DENSE_UNION: { - // Add one null to the first child and append n references to that child - int8_t type_id = _ArrowArrayUnionTypeId(array, 0); - NANOARROW_RETURN_NOT_OK( - _ArrowArrayAppendEmptyInternal(array->children[0], 1, is_valid)); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); - for (int64_t i = 0; i < n; i++) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( - ArrowArrayBuffer(array, 1), (int32_t)array->children[0]->length - 1)); - } - // For the purposes of array->null_count, union elements are never considered "null" - // even if some children contain nulls. - array->length += n; - return NANOARROW_OK; - } - - case NANOARROW_TYPE_SPARSE_UNION: { - // Add n nulls to the first child and append n references to that child - int8_t type_id = _ArrowArrayUnionTypeId(array, 0); - NANOARROW_RETURN_NOT_OK( - _ArrowArrayAppendEmptyInternal(array->children[0], n, is_valid)); - for (int64_t i = 1; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); - } - - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); - // For the purposes of array->null_count, union elements are never considered "null" - // even if some children contain nulls. - array->length += n; - return NANOARROW_OK; - } - - case NANOARROW_TYPE_FIXED_SIZE_LIST: - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( - array->children[0], n * private_data->layout.child_size_elements)); - break; - case NANOARROW_TYPE_STRUCT: - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); - } - break; - - default: - break; - } - - // Append n is_valid bits to the validity bitmap. If we haven't allocated a bitmap yet - // and we need to append nulls, do it now. - if (!is_valid && private_data->bitmap.buffer.data == NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, array->length + n)); - ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); - ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); - } else if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); - ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); - } - - // Add appropriate buffer fill - struct ArrowBuffer* buffer; - int64_t size_bytes; - - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - buffer = ArrowArrayBuffer(array, i); - size_bytes = private_data->layout.element_size_bits[i] / 8; - - switch (private_data->layout.buffer_type[i]) { - case NANOARROW_BUFFER_TYPE_NONE: - case NANOARROW_BUFFER_TYPE_VALIDITY: - continue; - case NANOARROW_BUFFER_TYPE_DATA_OFFSET: - // Append the current value at the end of the offset buffer for each element - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes * n)); - - for (int64_t j = 0; j < n; j++) { - ArrowBufferAppendUnsafe(buffer, buffer->data + size_bytes * (array->length + j), - size_bytes); + if (n == 0) { + return NANOARROW_OK; + } + + // Some type-specific handling + switch (private_data->storage_type) { + case NANOARROW_TYPE_NA: + // (An empty value for a null array *is* a null) + array->null_count += n; + array->length += n; + return NANOARROW_OK; + + case NANOARROW_TYPE_DENSE_UNION: { + // Add one null to the first child and append n references to that + // child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendEmptyInternal( + array->children[0], 1, is_valid)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + for (int64_t i = 0; i < n; i++) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), + (int32_t)array->children[0]->length - 1)); + } + // For the purposes of array->null_count, union elements are never + // considered "null" even if some children contain nulls. + array->length += n; + return NANOARROW_OK; } - // Skip the data buffer - i++; - continue; - case NANOARROW_BUFFER_TYPE_DATA: - // Zero out the next bit of memory - if (private_data->layout.element_size_bits[i] % 8 == 0) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill(buffer, 0, size_bytes * n)); - } else { - NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, i, 0, n)); + case NANOARROW_TYPE_SPARSE_UNION: { + // Add n nulls to the first child and append n references to that + // child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendEmptyInternal( + array->children[0], n, is_valid)); + for (int64_t i = 1; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayAppendEmpty(array->children[i], n)); + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + // For the purposes of array->null_count, union elements are never + // considered "null" even if some children contain nulls. + array->length += n; + return NANOARROW_OK; } - continue; - case NANOARROW_BUFFER_TYPE_TYPE_ID: - case NANOARROW_BUFFER_TYPE_UNION_OFFSET: - // These cases return above - return EINVAL; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( + array->children[0], + n * private_data->layout.child_size_elements)); + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayAppendEmpty(array->children[i], n)); + } + break; + + default: + break; } - } - - array->length += n; - array->null_count += n * !is_valid; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) { - return _ArrowArrayAppendEmptyInternal(array, n, 0); -} - -static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n) { - return _ArrowArrayAppendEmptyInternal(array, n, 1); -} - -static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, - int64_t value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_INT64: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); - break; - case NANOARROW_TYPE_INT32: - _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, (int32_t)value)); - break; - case NANOARROW_TYPE_INT16: - _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt16(data_buffer, (int16_t)value)); - break; - case NANOARROW_TYPE_INT8: - _NANOARROW_CHECK_RANGE(value, INT8_MIN, INT8_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(data_buffer, (int8_t)value)); - break; - case NANOARROW_TYPE_UINT64: - case NANOARROW_TYPE_UINT32: - case NANOARROW_TYPE_UINT16: - case NANOARROW_TYPE_UINT8: - _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); - return ArrowArrayAppendUInt(array, value); - case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); - break; - case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); - break; - case NANOARROW_TYPE_BOOL: - NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, - uint64_t value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_UINT64: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); - break; - case NANOARROW_TYPE_UINT32: - _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); - break; - case NANOARROW_TYPE_UINT16: - _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); - break; - case NANOARROW_TYPE_UINT8: - _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); - break; - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_INT8: - _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); - return ArrowArrayAppendInt(array, value); - case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); - break; - case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); - break; - case NANOARROW_TYPE_BOOL: - NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, - double value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(double))); - break; - case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, - struct ArrowBufferView value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); - struct ArrowBuffer* data_buffer = ArrowArrayBuffer( - array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); - int32_t offset; - int64_t large_offset; - int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - offset = ((int32_t*)offset_buffer->data)[array->length]; - if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { - return EOVERFLOW; - } - - offset += (int32_t)value.size_bytes; - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - large_offset = ((int64_t*)offset_buffer->data)[array->length]; - large_offset += value.size_bytes; - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - if (value.size_bytes != fixed_size_bytes) { - return EINVAL; - } - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - default: - return EINVAL; - } + // Append n is_valid bits to the validity bitmap. If we haven't allocated a + // bitmap yet and we need to append nulls, do it now. + if (!is_valid && private_data->bitmap.buffer.data == NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapReserve(&private_data->bitmap, array->length + n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } else if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } + // Add appropriate buffer fill + struct ArrowBuffer* buffer; + int64_t size_bytes; + + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + buffer = ArrowArrayBuffer(array, i); + size_bytes = private_data->layout.element_size_bits[i] / 8; + + switch (private_data->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_NONE: + case NANOARROW_BUFFER_TYPE_VALIDITY: + continue; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Append the current value at the end of the offset buffer for + // each element + NANOARROW_RETURN_NOT_OK( + ArrowBufferReserve(buffer, size_bytes * n)); + + for (int64_t j = 0; j < n; j++) { + ArrowBufferAppendUnsafe( + buffer, + buffer->data + size_bytes * (array->length + j), + size_bytes); + } + + // Skip the data buffer + i++; + continue; + case NANOARROW_BUFFER_TYPE_DATA: + // Zero out the next bit of memory + if (private_data->layout.element_size_bits[i] % 8 == 0) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(buffer, 0, size_bytes * n)); + } else { + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendBits(array, i, 0, n)); + } + continue; + + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + // These cases return above + return EINVAL; + } + } - array->length++; - return NANOARROW_OK; + array->length += n; + array->null_count += n * !is_valid; + return NANOARROW_OK; } -static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, - struct ArrowStringView value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; +static inline ArrowErrorCode ArrowArrayAppendNull( + struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 0); +} + +static inline ArrowErrorCode ArrowArrayAppendEmpty( + struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 1); +} + +static inline ArrowErrorCode ArrowArrayAppendInt( + struct ArrowArray* array, int64_t value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INT64: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); + break; + case NANOARROW_TYPE_INT32: + _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, (int32_t)value)); + break; + case NANOARROW_TYPE_INT16: + _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt16(data_buffer, (int16_t)value)); + break; + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_RANGE(value, INT8_MIN, INT8_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt8(data_buffer, (int8_t)value)); + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); + return ArrowArrayAppendUInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } - struct ArrowBufferView buffer_view; - buffer_view.data.data = value.data; - buffer_view.size_bytes = value.size_bytes; + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } - switch (private_data->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_BINARY: - case NANOARROW_TYPE_LARGE_BINARY: - return ArrowArrayAppendBytes(array, buffer_view); - default: - return EINVAL; - } + array->length++; + return NANOARROW_OK; } -static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, - const struct ArrowInterval* value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; +static inline ArrowErrorCode ArrowArrayAppendUInt( + struct ArrowArray* array, uint64_t value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UINT64: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); + break; + case NANOARROW_TYPE_UINT32: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); + break; + case NANOARROW_TYPE_UINT16: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); + break; + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); + break; + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); + return ArrowArrayAppendInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } - switch (private_data->storage_type) { - case NANOARROW_TYPE_INTERVAL_MONTHS: { - if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { - return EINVAL; - } + array->length++; + return NANOARROW_OK; +} - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); - break; +static inline ArrowErrorCode ArrowArrayAppendDouble( + struct ArrowArray* array, double value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, &value, sizeof(double))); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + default: + return EINVAL; } - case NANOARROW_TYPE_INTERVAL_DAY_TIME: { - if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { - return EINVAL; - } - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->ms)); - break; + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); } - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { - if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { - return EINVAL; - } - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(data_buffer, value->ns)); - break; + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendBytes( + struct ArrowArray* array, struct ArrowBufferView value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); + struct ArrowBuffer* data_buffer = ArrowArrayBuffer( + array, + 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); + int32_t offset; + int64_t large_offset; + int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + offset = ((int32_t*)offset_buffer->data)[array->length]; + if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { + return EOVERFLOW; + } + + offset += (int32_t)value.size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + large_offset = ((int64_t*)offset_buffer->data)[array->length]; + large_offset += value.size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + offset_buffer, &large_offset, sizeof(int64_t))); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (value.size_bytes != fixed_size_bytes) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value.data.data, value.size_bytes)); + break; + default: + return EINVAL; } - default: - return EINVAL; - } - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } - array->length++; - return NANOARROW_OK; + array->length++; + return NANOARROW_OK; } -static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, - const struct ArrowDecimal* value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); +static inline ArrowErrorCode ArrowArrayAppendString( + struct ArrowArray* array, struct ArrowStringView value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; - switch (private_data->storage_type) { - case NANOARROW_TYPE_DECIMAL128: - if (value->n_words != 2) { - return EINVAL; - } else { - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value->words, 2 * sizeof(uint64_t))); - break; - } - case NANOARROW_TYPE_DECIMAL256: - if (value->n_words != 4) { - return EINVAL; - } else { + struct ArrowBufferView buffer_view; + buffer_view.data.data = value.data; + buffer_view.size_bytes = value.size_bytes; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + return ArrowArrayAppendBytes(array, buffer_view); + default: + return EINVAL; + } +} + +static inline ArrowErrorCode ArrowArrayAppendInterval( + struct ArrowArray* array, const struct ArrowInterval* value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->months)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->ms)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->months)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(data_buffer, value->ns)); + break; + } + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value->words, 4 * sizeof(uint64_t))); - break; - } - default: - return EINVAL; - } + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDecimal( + struct ArrowArray* array, const struct ArrowDecimal* value) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + if (value->n_words != 2) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value->words, 2 * sizeof(uint64_t))); + break; + } + case NANOARROW_TYPE_DECIMAL256: + if (value->n_words != 4) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend( + data_buffer, value->words, 4 * sizeof(uint64_t))); + break; + } + default: + return EINVAL; + } - array->length++; - return NANOARROW_OK; + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - int64_t child_length; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_MAP: - child_length = array->children[0]->length; - if (child_length > INT32_MAX) { - return EOVERFLOW; - } - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), (int32_t)child_length)); - break; - case NANOARROW_TYPE_LARGE_LIST: - child_length = array->children[0]->length; - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendInt64(ArrowArrayBuffer(array, 1), child_length)); - break; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - child_length = array->children[0]->length; - if (child_length != - ((array->length + 1) * private_data->layout.child_size_elements)) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + int64_t child_length; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + child_length = array->children[0]->length; + if (child_length > INT32_MAX) { + return EOVERFLOW; + } + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), (int32_t)child_length)); + break; + case NANOARROW_TYPE_LARGE_LIST: + child_length = array->children[0]->length; + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64( + ArrowArrayBuffer(array, 1), child_length)); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_length = array->children[0]->length; + if (child_length != ((array->length + 1) * + private_data->layout.child_size_elements)) { + return EINVAL; + } + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + child_length = array->children[i]->length; + if (child_length != (array->length + 1)) { + return EINVAL; + } + } + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishUnionElement( + struct ArrowArray* array, int8_t type_id) { + struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*) + array->private_data; + + int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); + if (child_index < 0 || child_index >= array->n_children) { return EINVAL; - } - break; - case NANOARROW_TYPE_STRUCT: - for (int64_t i = 0; i < array->n_children; i++) { - child_length = array->children[i]->length; - if (child_length != (array->length + 1)) { - return EINVAL; - } - } - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, - int8_t type_id) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); - if (child_index < 0 || child_index >= array->n_children) { - return EINVAL; - } - - switch (private_data->storage_type) { - case NANOARROW_TYPE_DENSE_UNION: - // Append the target child length to the union offsets buffer - _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( - ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); - break; - case NANOARROW_TYPE_SPARSE_UNION: - // Append one empty to any non-target column that isn't already the right length - // or abort if appending a null will result in a column with invalid length - for (int64_t i = 0; i < array->n_children; i++) { - if (i == child_index || array->children[i]->length == (array->length + 1)) { - continue; - } + } - if (array->children[i]->length != array->length) { - return EINVAL; - } + switch (private_data->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + // Append the target child length to the union offsets buffer + _NANOARROW_CHECK_RANGE( + array->children[child_index]->length, 0, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), + (int32_t)array->children[child_index]->length - 1)); + break; + case NANOARROW_TYPE_SPARSE_UNION: + // Append one empty to any non-target column that isn't already the + // right length or abort if appending a null will result in a column + // with invalid length + for (int64_t i = 0; i < array->n_children; i++) { + if (i == child_index || + array->children[i]->length == (array->length + 1)) { + continue; + } + + if (array->children[i]->length != array->length) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowArrayAppendEmpty(array->children[i], 1)); + } + + break; + default: + return EINVAL; + } + + // Write to the type_ids buffer + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); + array->length++; + return NANOARROW_OK; +} + +static inline void ArrowArrayViewMove( + struct ArrowArrayView* src, struct ArrowArrayView* dst) { + memcpy(dst, src, sizeof(struct ArrowArrayView)); + ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); +} + +static inline int8_t ArrowArrayViewIsNull( + const struct ArrowArrayView* array_view, int64_t i) { + const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_NA: + return 0x01; + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + // Unions are "never null" in Arrow land + return 0x00; + default: + return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); + } +} - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1)); - } - - break; - default: - return EINVAL; - } - - // Write to the type_ids buffer - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); - array->length++; - return NANOARROW_OK; -} - -static inline void ArrowArrayViewMove(struct ArrowArrayView* src, - struct ArrowArrayView* dst) { - memcpy(dst, src, sizeof(struct ArrowArrayView)); - ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); -} - -static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, - int64_t i) { - const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; - i += array_view->offset; - switch (array_view->storage_type) { - case NANOARROW_TYPE_NA: - return 0x01; - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_SPARSE_UNION: - // Unions are "never null" in Arrow land - return 0x00; - default: - return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); - } -} - -static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, - int64_t i) { - switch (array_view->storage_type) { - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_SPARSE_UNION: - return array_view->buffer_views[0].data.as_int8[i]; - default: - return -1; - } +static inline int8_t ArrowArrayViewUnionTypeId( + const struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + return array_view->buffer_views[0].data.as_int8[i]; + default: + return -1; + } } static inline int8_t ArrowArrayViewUnionChildIndex( const struct ArrowArrayView* array_view, int64_t i) { - int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); - if (array_view->union_type_id_map == NULL) { - return type_id; - } else { - return array_view->union_type_id_map[type_id]; - } + int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); + if (array_view->union_type_id_map == NULL) { + return type_id; + } else { + return array_view->union_type_id_map[type_id]; + } } static inline int64_t ArrowArrayViewUnionChildOffset( const struct ArrowArrayView* array_view, int64_t i) { - switch (array_view->storage_type) { - case NANOARROW_TYPE_DENSE_UNION: - return array_view->buffer_views[1].data.as_int32[i]; - case NANOARROW_TYPE_SPARSE_UNION: - return i; - default: - return -1; - } + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_SPARSE_UNION: + return i; + default: + return -1; + } } static inline int64_t ArrowArrayViewListChildOffset( const struct ArrowArrayView* array_view, int64_t i) { - switch (array_view->storage_type) { - case NANOARROW_TYPE_LIST: - return array_view->buffer_views[1].data.as_int32[i]; - case NANOARROW_TYPE_LARGE_LIST: - return array_view->buffer_views[1].data.as_int64[i]; - default: - return -1; - } -} - -static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, - int64_t i) { - const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - i += array_view->offset; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INT64: - return data_view->data.as_int64[i]; - case NANOARROW_TYPE_UINT64: - return data_view->data.as_uint64[i]; - case NANOARROW_TYPE_INTERVAL_MONTHS: - case NANOARROW_TYPE_INT32: - return data_view->data.as_int32[i]; - case NANOARROW_TYPE_UINT32: - return data_view->data.as_uint32[i]; - case NANOARROW_TYPE_INT16: - return data_view->data.as_int16[i]; - case NANOARROW_TYPE_UINT16: - return data_view->data.as_uint16[i]; - case NANOARROW_TYPE_INT8: - return data_view->data.as_int8[i]; - case NANOARROW_TYPE_UINT8: - return data_view->data.as_uint8[i]; - case NANOARROW_TYPE_DOUBLE: - return (int64_t)data_view->data.as_double[i]; - case NANOARROW_TYPE_FLOAT: - return (int64_t)data_view->data.as_float[i]; - case NANOARROW_TYPE_BOOL: - return ArrowBitGet(data_view->data.as_uint8, i); - default: - return INT64_MAX; - } + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_LARGE_LIST: + return array_view->buffer_views[1].data.as_int64[i]; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewGetIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (int64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (int64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return INT64_MAX; + } } static inline uint64_t ArrowArrayViewGetUIntUnsafe( const struct ArrowArrayView* array_view, int64_t i) { - i += array_view->offset; - const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INT64: - return data_view->data.as_int64[i]; - case NANOARROW_TYPE_UINT64: - return data_view->data.as_uint64[i]; - case NANOARROW_TYPE_INTERVAL_MONTHS: - case NANOARROW_TYPE_INT32: - return data_view->data.as_int32[i]; - case NANOARROW_TYPE_UINT32: - return data_view->data.as_uint32[i]; - case NANOARROW_TYPE_INT16: - return data_view->data.as_int16[i]; - case NANOARROW_TYPE_UINT16: - return data_view->data.as_uint16[i]; - case NANOARROW_TYPE_INT8: - return data_view->data.as_int8[i]; - case NANOARROW_TYPE_UINT8: - return data_view->data.as_uint8[i]; - case NANOARROW_TYPE_DOUBLE: - return (uint64_t)data_view->data.as_double[i]; - case NANOARROW_TYPE_FLOAT: - return (uint64_t)data_view->data.as_float[i]; - case NANOARROW_TYPE_BOOL: - return ArrowBitGet(data_view->data.as_uint8, i); - default: - return UINT64_MAX; - } + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (uint64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (uint64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return UINT64_MAX; + } } static inline double ArrowArrayViewGetDoubleUnsafe( const struct ArrowArrayView* array_view, int64_t i) { - i += array_view->offset; - const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INT64: - return (double)data_view->data.as_int64[i]; - case NANOARROW_TYPE_UINT64: - return (double)data_view->data.as_uint64[i]; - case NANOARROW_TYPE_INT32: - return data_view->data.as_int32[i]; - case NANOARROW_TYPE_UINT32: - return data_view->data.as_uint32[i]; - case NANOARROW_TYPE_INT16: - return data_view->data.as_int16[i]; - case NANOARROW_TYPE_UINT16: - return data_view->data.as_uint16[i]; - case NANOARROW_TYPE_INT8: - return data_view->data.as_int8[i]; - case NANOARROW_TYPE_UINT8: - return data_view->data.as_uint8[i]; - case NANOARROW_TYPE_DOUBLE: - return data_view->data.as_double[i]; - case NANOARROW_TYPE_FLOAT: - return data_view->data.as_float[i]; - case NANOARROW_TYPE_BOOL: - return ArrowBitGet(data_view->data.as_uint8, i); - default: - return DBL_MAX; - } + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return (double)data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return (double)data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return DBL_MAX; + } } static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( const struct ArrowArrayView* array_view, int64_t i) { - i += array_view->offset; - const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; - const char* data_view = array_view->buffer_views[2].data.as_char; - - struct ArrowStringView view; - switch (array_view->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - view.data = data_view + offsets_view->data.as_int32[i]; - view.size_bytes = - offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; - break; - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - view.data = data_view + offsets_view->data.as_int64[i]; - view.size_bytes = - offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; - break; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - view.size_bytes = array_view->layout.element_size_bits[1] / 8; - view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes); - break; - default: - view.data = NULL; - view.size_bytes = 0; - break; - } - - return view; + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const char* data_view = array_view->buffer_views[2].data.as_char; + + struct ArrowStringView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.data = data_view + offsets_view->data.as_int32[i]; + view.size_bytes = offsets_view->data.as_int32[i + 1] - + offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.data = data_view + offsets_view->data.as_int64[i]; + view.size_bytes = offsets_view->data.as_int64[i + 1] - + offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data = array_view->buffer_views[1].data.as_char + + (i * view.size_bytes); + break; + default: + view.data = NULL; + view.size_bytes = 0; + break; + } + + return view; } static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( const struct ArrowArrayView* array_view, int64_t i) { - i += array_view->offset; - const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; - const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; - - struct ArrowBufferView view; - switch (array_view->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - view.size_bytes = - offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; - view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; - break; - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - view.size_bytes = - offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; - view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; - break; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - view.size_bytes = array_view->layout.element_size_bits[1] / 8; - view.data.as_uint8 = - array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes); - break; - default: - view.data.data = NULL; - view.size_bytes = 0; - break; - } - - return view; + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; + + struct ArrowBufferView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.size_bytes = offsets_view->data.as_int32[i + 1] - + offsets_view->data.as_int32[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.size_bytes = offsets_view->data.as_int64[i + 1] - + offsets_view->data.as_int64[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data.as_uint8 = array_view->buffer_views[1].data.as_uint8 + + (i * view.size_bytes); + break; + default: + view.data.data = NULL; + view.size_bytes = 0; + break; + } + + return view; } static inline void ArrowArrayViewGetIntervalUnsafe( - const struct ArrowArrayView* array_view, int64_t i, struct ArrowInterval* out) { - const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INTERVAL_MONTHS: { - const size_t size = sizeof(int32_t); - memcpy(&out->months, data_view + i * size, sizeof(int32_t)); - break; - } - case NANOARROW_TYPE_INTERVAL_DAY_TIME: { - const size_t size = sizeof(int32_t) + sizeof(int32_t); - memcpy(&out->days, data_view + i * size, sizeof(int32_t)); - memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); - break; - } - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { - const size_t size = sizeof(int32_t) + sizeof(int32_t) + sizeof(int64_t); - memcpy(&out->months, data_view + i * size, sizeof(int32_t)); - memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); - memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); - break; - } - default: - break; - } -} - -static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, - int64_t i, struct ArrowDecimal* out) { - i += array_view->offset; - const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; - switch (array_view->storage_type) { - case NANOARROW_TYPE_DECIMAL128: - ArrowDecimalSetBytes(out, data_view + (i * 16)); - break; - case NANOARROW_TYPE_DECIMAL256: - ArrowDecimalSetBytes(out, data_view + (i * 32)); - break; - default: - memset(out->words, 0, sizeof(out->words)); - break; - } + const struct ArrowArrayView* array_view, + int64_t i, + struct ArrowInterval* out) { + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + const size_t size = sizeof(int32_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + const size_t size = sizeof(int32_t) + sizeof(int32_t); + memcpy(&out->days, data_view + i * size, sizeof(int32_t)); + memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + const size_t size = sizeof(int32_t) + sizeof(int32_t) + + sizeof(int64_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); + memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); + break; + } + default: + break; + } +} + +static inline void ArrowArrayViewGetDecimalUnsafe( + const struct ArrowArrayView* array_view, + int64_t i, + struct ArrowDecimal* out) { + i += array_view->offset; + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + ArrowDecimalSetBytes(out, data_view + (i * 16)); + break; + case NANOARROW_TYPE_DECIMAL256: + ArrowDecimalSetBytes(out, data_view + (i * 32)); + break; + default: + memset(out->words, 0, sizeof(out->words)); + break; + } } #ifdef __cplusplus From 757746b3030b46bf2042f160e20986e29ca6937f Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 18 Mar 2024 09:58:07 -0500 Subject: [PATCH 15/39] Add additional necessary strdup --- libtiledbsoma/CMakeLists.txt | 4 ---- libtiledbsoma/src/utils/arrow_adapter.cc | 21 +++------------------ 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/libtiledbsoma/CMakeLists.txt b/libtiledbsoma/CMakeLists.txt index a055d71cd2..a1a0de14a6 100644 --- a/libtiledbsoma/CMakeLists.txt +++ b/libtiledbsoma/CMakeLists.txt @@ -187,10 +187,6 @@ if(MSVC) else() add_compile_options(-Wall -Wextra) - if(TILEDBSOMA_ENABLE_WERROR) - add_compile_options(-Werror) - endif() - # Build-specific flags if(CMAKE_BUILD_TYPE MATCHES "Debug") add_compile_options(-DDEBUG -O0 -g3 -ggdb3 -gdwarf-3) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index b0b44b4649..460ed38af9 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -328,15 +328,13 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { exitIfError( ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); -#if 0 - schema->format = to_arrow_format(column->type()).data(); - schema->name = column->name().data(); + schema->format = strdup(to_arrow_format(column->type()).data()); + schema->name = strdup(column->name().data()); schema->metadata = nullptr; schema->flags = 0; schema->n_children = 0; schema->children = nullptr; schema->dictionary = nullptr; -#endif schema->release = &release_schema; schema->private_data = nullptr; @@ -368,7 +366,6 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { array->n_buffers, column->is_nullable())); -#if 0 array->null_count = 0; array->offset = 0; array->n_buffers = n_buffers; @@ -376,7 +373,6 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { array->buffers = nullptr; array->children = nullptr; array->dictionary = nullptr; -#endif array->release = &release_array; array->private_data = (void*)arrow_buffer; @@ -436,7 +432,6 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { exitIfError( ArrowSchemaAllocateChildren(dict_sch, 0), "Bad schema children alloc"); -#if 0 dict_sch->format = strdup(to_arrow_format(enmr->type(), false).data()); dict_sch->name = nullptr; dict_sch->metadata = nullptr; @@ -446,7 +441,6 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { dict_sch->dictionary = nullptr; dict_sch->release = &release_schema; dict_sch->private_data = nullptr; -#endif exitIfError( ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); @@ -454,16 +448,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { ArrowArrayAllocateChildren(dict_arr, 0), "Bad array children alloc"); dict_arr->release = &release_array; -#if 0 - dict_arr->null_count = 0; - dict_arr->offset = 0; - dict_arr->n_buffers = n_buf; - dict_arr->n_children = 0; - dict_arr->buffers = nullptr; - dict_arr->children = nullptr; - dict_arr->dictionary = nullptr; - dict_arr->private_data = nullptr; -#endif + const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2; // TODO string types currently get the data and offset // buffers from ColumnBuffer::enum_offsets and From 5fcafa4df178581ac82761266740f50b3c37b609 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 19 Mar 2024 09:42:59 -0500 Subject: [PATCH 16/39] No longer to protect one statement --- libtiledbsoma/src/utils/arrow_adapter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 460ed38af9..7b2b286ca1 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -51,7 +51,7 @@ void ArrowAdapter::release_schema(struct ArrowSchema* schema) { } if (schema->format != nullptr) { LOG_TRACE("[ArrowAdapter] release_schema schema->format"); - //free((void*)schema->format); + free((void*)schema->format); schema->format = nullptr; } if (schema->metadata != nullptr) { From a7f476f241700cbabf8a232ff6067409a67fec48 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 19 Mar 2024 11:36:34 -0500 Subject: [PATCH 17/39] Support TILEDB_DATETIME_DAY aka Date as well --- libtiledbsoma/src/utils/arrow_adapter.cc | 65 +++++++++++++++--------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 7b2b286ca1..8b5995be8b 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022 TileDB, Inc. + * @copyright Copyright (c) 2022-2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -131,9 +131,8 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { } if (array->dictionary != nullptr) { - // -- TODO: This can lead to segfault on some data sets and could be - // cause - // by how we fill arrow data structures. This should pass. + // TODO: This can lead to segfault on some data sets, could be caused + // by how we fill arrow data structures. This should pass. // if (array->dictionary->release != nullptr) { // LOG_TRACE("[ArrowAdapter] release_array array->dict release"); // release_array(array->dictionary); @@ -158,15 +157,14 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( arrow_schema->n_children = ndim + nattr; arrow_schema->release = &ArrowAdapter::release_schema; arrow_schema->children = (ArrowSchema**)malloc( - arrow_schema->n_children * - sizeof(ArrowSchema*)); // new ArrowSchema*[arrow_schema->n_children]; + arrow_schema->n_children * sizeof(ArrowSchema*)); ArrowSchema* child = nullptr; for (uint32_t i = 0; i < ndim; ++i) { auto dim = tiledb_schema.domain().dimension(i); child = arrow_schema->children[i] = (ArrowSchema*)malloc( - sizeof(ArrowSchema)); // new ArrowSchema; + sizeof(ArrowSchema)); child->format = strdup( ArrowAdapter::to_arrow_format(dim.type()).data()); child->name = strdup(dim.name().c_str()); @@ -181,7 +179,7 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( for (uint32_t i = 0; i < nattr; ++i) { auto attr = tiledb_schema.attribute(i); child = arrow_schema->children[ndim + i] = (ArrowSchema*)malloc( - sizeof(ArrowSchema)); // new ArrowSchema; + sizeof(ArrowSchema)); child->format = strdup( ArrowAdapter::to_arrow_format(attr.type()).data()); child->name = strdup(attr.name().c_str()); @@ -238,7 +236,7 @@ std::pair ArrowAdapter::_get_data_and_length( // Allocate a single byte to copy the bits into size_t sz = 1; - dst = malloc(sz); // new const void*[sz]; + dst = malloc(sz); std::memcpy((void*)dst, &src, sz); return std::pair(dst, data.size()); @@ -313,6 +311,7 @@ inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { fmt::format("ArrowAdapter: Arrow Error {} ", msg)); } + std::pair, std::unique_ptr> ArrowAdapter::to_arrow(std::shared_ptr column) { std::unique_ptr schema = std::make_unique(); @@ -338,27 +337,24 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { schema->release = &release_schema; schema->private_data = nullptr; - int n_buffers = column->is_var() ? 3 : - 2; // this will be 2 for enumerations - // and 3 for char vectors + int n_buffers = column->is_var() ? 3 : // this will be 3 for char vecs + 2; // and 2 for enumerations // Create an ArrowBuffer to manage the lifetime of `column`. - // - `arrow_buffer` holds a shared_ptr to `column`, which - // increments + // - `arrow_buffer` holds shared_ptr to `column`, increments // the use count and keeps the ColumnBuffer data alive. // - When the arrow array is released, `array->release()` is - // called with - // `arrow_buffer` in `private_data`. `arrow_buffer` is - // deleted, which decrements the the `column` use count. When - // the `column` use count reaches 0, the ColumnBuffer data - // will be deleted. + // called with `arrow_buffer` in `private_data`. + // `arrow_buffer` is deleted, which decrements the the + // `column` use count. When the `column` use count reaches + // 0, the ColumnBuffer data will be deleted. auto arrow_buffer = new ArrowBuffer(column); exitIfError(ArrowArrayInitFromType(arr, natype), "Bad array init"); exitIfError(ArrowArrayAllocateChildren(arr, 0), "Bad array children alloc"); array->length = column->size(); - LOG_DEBUG(fmt::format( + LOG_TRACE(fmt::format( "[ArrowAdapter] column type {} name {} nbuf {} {} nullable {}", to_arrow_format(column->type()).data(), column->name().data(), @@ -391,8 +387,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { } if (column->is_nullable()) { - schema->flags |= ARROW_FLAG_NULLABLE; // turns out it is also set by - // default + schema->flags |= ARROW_FLAG_NULLABLE; // it is also set by default // Count nulls for (auto v : column->validity()) { @@ -403,8 +398,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { column->validity_to_bitmap(); array->buffers[0] = column->validity().data(); } else { - schema->flags = 0; // because ArrowSchemaInitFromType leads to NULLABLE - // set + schema->flags = 0; // as ArrowSchemaInitFromType leads to NULLABLE set } if (column->is_ordered()) { @@ -416,6 +410,19 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { column->data_to_bitmap(); } + // Workaround for date + if (column->type() == TILEDB_DATETIME_DAY) { + // TODO: Put in ColumnBuffer + size_t n = array->length; + std::vector indata(n); + std::memcpy(indata.data(), column->data().data(), sizeof(int64_t) * n); + std::vector vec(n); + for (size_t i=0; i(indata[i]); + } + std::memcpy((void*)array->buffers[n_buffers - 1], vec.data(), sizeof(int32_t) * n); + } + if (column->has_enumeration()) { auto dict_sch = (ArrowSchema*)malloc( sizeof(ArrowSchema)); // new ArrowSchema; @@ -449,6 +456,10 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { "Bad array children alloc"); dict_arr->release = &release_array; const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2; + dict_arr->private_data = nullptr; + dict_arr->buffers = (const void**)malloc( + sizeof(void*) * n_buf); + dict_arr->buffers[0] = nullptr; // validity: none here // TODO string types currently get the data and offset // buffers from ColumnBuffer::enum_offsets and @@ -521,6 +532,8 @@ std::string_view ArrowAdapter::to_arrow_format( return "ttu"; case TILEDB_TIME_NS: return "ttn"; + case TILEDB_DATETIME_DAY: + return "tdD"; case TILEDB_DATETIME_SEC: return "tss:"; case TILEDB_DATETIME_MS: @@ -567,6 +580,10 @@ enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { return NANOARROW_TYPE_BOOL; else if (sv == "tss:") return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsm:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tdD") + return NANOARROW_TYPE_DOUBLE; // R Date: fractional days since epoch else if (sv == "z") return NANOARROW_TYPE_BINARY; else if (sv == "Z") From 531156a8baf6fac72b94599dfac891c89f67c281 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 19 Mar 2024 12:02:32 -0500 Subject: [PATCH 18/39] Meh --- libtiledbsoma/src/utils/arrow_adapter.cc | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 8b5995be8b..be73205ead 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -204,6 +204,7 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( dict->format = strdup( ArrowAdapter::to_arrow_format(enmr.type(), false).data()); } + dict->name = strdup(enmr.name().c_str()); dict->metadata = nullptr; dict->flags = 0; @@ -311,7 +312,6 @@ inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { fmt::format("ArrowAdapter: Arrow Error {} ", msg)); } - std::pair, std::unique_ptr> ArrowAdapter::to_arrow(std::shared_ptr column) { std::unique_ptr schema = std::make_unique(); @@ -337,8 +337,8 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { schema->release = &release_schema; schema->private_data = nullptr; - int n_buffers = column->is_var() ? 3 : // this will be 3 for char vecs - 2; // and 2 for enumerations + // this will be 3 for char vecs and 2 for enumerations + int n_buffers = column->is_var() ? 3 : 2; // Create an ArrowBuffer to manage the lifetime of `column`. // - `arrow_buffer` holds shared_ptr to `column`, increments @@ -415,12 +415,16 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { // TODO: Put in ColumnBuffer size_t n = array->length; std::vector indata(n); - std::memcpy(indata.data(), column->data().data(), sizeof(int64_t) * n); + std::memcpy( + indata.data(), column->data().data(), sizeof(int64_t) * n); std::vector vec(n); - for (size_t i=0; i(indata[i]); } - std::memcpy((void*)array->buffers[n_buffers - 1], vec.data(), sizeof(int32_t) * n); + std::memcpy( + (void*)array->buffers[n_buffers - 1], + vec.data(), + sizeof(int32_t) * n); } if (column->has_enumeration()) { @@ -457,8 +461,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { dict_arr->release = &release_array; const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2; dict_arr->private_data = nullptr; - dict_arr->buffers = (const void**)malloc( - sizeof(void*) * n_buf); + dict_arr->buffers = (const void**)malloc(sizeof(void*) * n_buf); dict_arr->buffers[0] = nullptr; // validity: none here // TODO string types currently get the data and offset @@ -579,11 +582,11 @@ enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { else if (sv == "b") return NANOARROW_TYPE_BOOL; else if (sv == "tss:") - return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently else if (sv == "tsm:") - return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently else if (sv == "tdD") - return NANOARROW_TYPE_DOUBLE; // R Date: fractional days since epoch + return NANOARROW_TYPE_DOUBLE; // R Date: fractional days since epoch else if (sv == "z") return NANOARROW_TYPE_BINARY; else if (sv == "Z") From 7db58ff7e915b9dbf49f74ebb3b3d28b30b41929 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 19 Mar 2024 12:54:58 -0500 Subject: [PATCH 19/39] Meh with version 14.0.0 and not 14.0.6 because ... sure --- libtiledbsoma/src/utils/arrow_adapter.cc | 994 +++++++++++------------ 1 file changed, 482 insertions(+), 512 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index be73205ead..b2c810f876 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -38,264 +38,241 @@ namespace tiledbsoma { using namespace tiledb; -void ArrowAdapter::release_schema(struct ArrowSchema* schema) { - LOG_DEBUG("[ArrowAdapter] release_schema"); - if (schema->name != nullptr) - LOG_DEBUG( - fmt::format("[ArrowAdapter] release_schema for {}", schema->name)); - - if (schema->name != nullptr) { - LOG_TRACE("[ArrowAdapter] release_schema schema->name"); - free((void*)schema->name); - schema->name = nullptr; - } - if (schema->format != nullptr) { - LOG_TRACE("[ArrowAdapter] release_schema schema->format"); - free((void*)schema->format); - schema->format = nullptr; - } - if (schema->metadata != nullptr) { - LOG_TRACE("[ArrowAdapter] release_schema schema->metadata"); - free((void*)schema->metadata); - schema->metadata = nullptr; - } - - if (schema->children != nullptr) { - for (auto i = 0; i < schema->n_children; i++) { - if (schema->children[i] != nullptr) { - if (schema->children[i]->release != nullptr) { - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_schema schema->child {} " - "release", - i)); - release_schema(schema->children[i]); - } - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_schema schema->child {} free", i)); - free(schema->children[i]); - } +void ArrowAdapter::release_schema(struct ArrowSchema *schema) { + if (schema->name != nullptr) + LOG_DEBUG( + fmt::format("[ArrowAdapter] release_schema for {}", schema->name)); + + if (schema->name != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->name"); + free((void *)schema->name); + schema->name = nullptr; + } + if (schema->format != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->format"); + free((void *)schema->format); + schema->format = nullptr; + } + if (schema->metadata != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->metadata"); + free((void *)schema->metadata); + schema->metadata = nullptr; + } + + if (schema->children != nullptr) { + for (auto i = 0; i < schema->n_children; i++) { + if (schema->children[i] != nullptr) { + if (schema->children[i]->release != nullptr) { + LOG_TRACE( + fmt::format("[ArrowAdapter] release_schema schema->child {} " + "release", + i)); + release_schema(schema->children[i]); } - LOG_TRACE("[ArrowAdapter] release_schema schema->children"); - free(schema->children); - schema->children = nullptr; + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema schema->child {} free", i)); + free(schema->children[i]); + } } + LOG_TRACE("[ArrowAdapter] release_schema schema->children"); + free(schema->children); + schema->children = nullptr; + } - if (schema->dictionary != nullptr) { - if (schema->dictionary->release != nullptr) { - LOG_TRACE("[ArrowAdapter] release_schema schema->dict release"); - release_schema(schema->dictionary); - } - LOG_TRACE("[ArrowAdapter] release_schema schema->dict free"); - free(schema->dictionary); - schema->dictionary = nullptr; + if (schema->dictionary != nullptr) { + if (schema->dictionary->release != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->dict release"); + release_schema(schema->dictionary); } + LOG_TRACE("[ArrowAdapter] release_schema schema->dict free"); + free(schema->dictionary); + schema->dictionary = nullptr; + } - schema->release = nullptr; - LOG_TRACE("[ArrowAdapter] release_schema done"); + schema->release = nullptr; + LOG_TRACE("[ArrowAdapter] release_schema done"); } -void ArrowAdapter::release_array(struct ArrowArray* array) { - auto arrow_buffer = static_cast(array->private_data); - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_array {} use_count={}", - arrow_buffer->buffer_->name(), - arrow_buffer->buffer_.use_count())); - - // Delete the ArrowBuffer, which was allocated with new. - // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the - // underlying ColumnBuffer, the ColumnBuffer will be deleted. - delete arrow_buffer; - - if (array->buffers != nullptr) { - delete[] array->buffers; - array->buffers = nullptr; - } +void ArrowAdapter::release_array(struct ArrowArray *array) { + auto arrow_buffer = static_cast(array->private_data); + LOG_TRACE(fmt::format("[ArrowAdapter] release_array {} use_count={}", + arrow_buffer->buffer_->name(), + arrow_buffer->buffer_.use_count())); - if (array->children != nullptr) { - for (auto i = 0; i < array->n_children; i++) { - if (array->children[i] != nullptr) { - if (array->children[i]->release != nullptr) { - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_schema array->child {} release", - i)); - release_array(array->children[i]); - } - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_schema array->child {} free", i)); - free(array->children[i]); - } - } - LOG_TRACE("[ArrowAdapter] release_array array->children"); - free(array->children); - array->children = nullptr; - } + // Delete the ArrowBuffer, which was allocated with new. + // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the + // underlying ColumnBuffer, the ColumnBuffer will be deleted. + delete arrow_buffer; - if (array->dictionary != nullptr) { - // TODO: This can lead to segfault on some data sets, could be caused - // by how we fill arrow data structures. This should pass. - // if (array->dictionary->release != nullptr) { - // LOG_TRACE("[ArrowAdapter] release_array array->dict release"); - // release_array(array->dictionary); - //} - LOG_TRACE("[ArrowAdapter] release_array array->dict free"); - free(array->dictionary); - array->dictionary = nullptr; + if (array->buffers != nullptr) { + delete[] array->buffers; + array->buffers = nullptr; + } + + if (array->children != nullptr) { + for (auto i = 0; i < array->n_children; i++) { + if (array->children[i] != nullptr) { + if (array->children[i]->release != nullptr) { + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema array->child {} release", i)); + release_array(array->children[i]); + } + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema array->child {} free", i)); + free(array->children[i]); + } } + LOG_TRACE("[ArrowAdapter] release_array array->children"); + free(array->children); + array->children = nullptr; + } + + if (array->dictionary != nullptr) { + // TODO: This can lead to segfault on some data sets, could be caused + // by how we fill arrow data structures. This should pass. + // if (array->dictionary->release != nullptr) { + // LOG_TRACE("[ArrowAdapter] release_array array->dict release"); + // release_array(array->dictionary); + //} + LOG_TRACE("[ArrowAdapter] release_array array->dict free"); + free(array->dictionary); + array->dictionary = nullptr; + } - array->release = nullptr; - LOG_TRACE(fmt::format("[ArrowAdapter] release_array done")); + array->release = nullptr; + LOG_TRACE(fmt::format("[ArrowAdapter] release_array done")); } std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array) { - auto tiledb_schema = tiledb_array->schema(); - auto ndim = tiledb_schema.domain().ndim(); - auto nattr = tiledb_schema.attribute_num(); - - std::unique_ptr arrow_schema = std::make_unique(); - arrow_schema->format = strdup("+s"); - arrow_schema->n_children = ndim + nattr; - arrow_schema->release = &ArrowAdapter::release_schema; - arrow_schema->children = (ArrowSchema**)malloc( - arrow_schema->n_children * sizeof(ArrowSchema*)); - - ArrowSchema* child = nullptr; - - for (uint32_t i = 0; i < ndim; ++i) { - auto dim = tiledb_schema.domain().dimension(i); - child = arrow_schema->children[i] = (ArrowSchema*)malloc( - sizeof(ArrowSchema)); - child->format = strdup( - ArrowAdapter::to_arrow_format(dim.type()).data()); - child->name = strdup(dim.name().c_str()); - child->metadata = nullptr; - child->flags = 0; - child->n_children = 0; - child->dictionary = nullptr; - child->children = nullptr; - child->release = &ArrowAdapter::release_schema; - } - - for (uint32_t i = 0; i < nattr; ++i) { - auto attr = tiledb_schema.attribute(i); - child = arrow_schema->children[ndim + i] = (ArrowSchema*)malloc( - sizeof(ArrowSchema)); - child->format = strdup( - ArrowAdapter::to_arrow_format(attr.type()).data()); - child->name = strdup(attr.name().c_str()); - child->metadata = nullptr; - child->flags = attr.nullable() ? ARROW_FLAG_NULLABLE : 0; - child->n_children = 0; - child->children = nullptr; - child->dictionary = nullptr; - - auto enmr_name = AttributeExperimental::get_enumeration_name( - *ctx, attr); - if (enmr_name.has_value()) { - auto enmr = ArrayExperimental::get_enumeration( - *ctx, *tiledb_array, attr.name()); - - auto dict = (ArrowSchema*)malloc( - sizeof(ArrowSchema)); // new ArrowSchema; - if (enmr.type() == TILEDB_STRING_ASCII or - enmr.type() == TILEDB_CHAR) { - dict->format = strdup("z"); - } else { - dict->format = strdup( - ArrowAdapter::to_arrow_format(enmr.type(), false).data()); - } - - dict->name = strdup(enmr.name().c_str()); - dict->metadata = nullptr; - dict->flags = 0; - dict->n_children = 0; - dict->children = nullptr; - dict->dictionary = nullptr; - dict->release = &ArrowAdapter::release_schema; - dict->private_data = nullptr; - child->dictionary = dict; - } - child->release = &ArrowAdapter::release_schema; + auto tiledb_schema = tiledb_array->schema(); + auto ndim = tiledb_schema.domain().ndim(); + auto nattr = tiledb_schema.attribute_num(); + + std::unique_ptr arrow_schema = std::make_unique(); + arrow_schema->format = strdup("+s"); + arrow_schema->n_children = ndim + nattr; + arrow_schema->release = &ArrowAdapter::release_schema; + arrow_schema->children = + (ArrowSchema **)malloc(arrow_schema->n_children * sizeof(ArrowSchema *)); + + ArrowSchema *child = nullptr; + + for (uint32_t i = 0; i < ndim; ++i) { + auto dim = tiledb_schema.domain().dimension(i); + child = arrow_schema->children[i] = + (ArrowSchema *)malloc(sizeof(ArrowSchema)); + child->format = strdup(ArrowAdapter::to_arrow_format(dim.type()).data()); + child->name = strdup(dim.name().c_str()); + child->metadata = nullptr; + child->flags = 0; + child->n_children = 0; + child->dictionary = nullptr; + child->children = nullptr; + child->release = &ArrowAdapter::release_schema; + } + + for (uint32_t i = 0; i < nattr; ++i) { + auto attr = tiledb_schema.attribute(i); + child = arrow_schema->children[ndim + i] = + (ArrowSchema *)malloc(sizeof(ArrowSchema)); + child->format = strdup(ArrowAdapter::to_arrow_format(attr.type()).data()); + child->name = strdup(attr.name().c_str()); + child->metadata = nullptr; + child->flags = attr.nullable() ? ARROW_FLAG_NULLABLE : 0; + child->n_children = 0; + child->children = nullptr; + child->dictionary = nullptr; + + auto enmr_name = AttributeExperimental::get_enumeration_name(*ctx, attr); + if (enmr_name.has_value()) { + auto enmr = + ArrayExperimental::get_enumeration(*ctx, *tiledb_array, attr.name()); + auto dict = (ArrowSchema *)malloc(sizeof(ArrowSchema)); + dict->format = + strdup(ArrowAdapter::to_arrow_format(enmr.type(), false).data()); + dict->name = strdup(enmr.name().c_str()); + dict->metadata = nullptr; + dict->flags = 0; + dict->n_children = 0; + dict->children = nullptr; + dict->dictionary = nullptr; + dict->release = &ArrowAdapter::release_schema; + dict->private_data = nullptr; + child->dictionary = dict; } + child->release = &ArrowAdapter::release_schema; + } - return arrow_schema; + return arrow_schema; } -std::pair ArrowAdapter::_get_data_and_length( - Enumeration& enmr, const void* dst) { - switch (enmr.type()) { - case TILEDB_BOOL: { - // We must handle this specially because vector does - // not store elements contiguously in memory - auto data = enmr.as_vector(); - - // Represent the Boolean vector with, at most, the last two - // bits. In Arrow, Boolean values are LSB packed - uint8_t src = 0; - for (size_t i = 0; i < data.size(); ++i) - src |= (data[i] << i); - - // Allocate a single byte to copy the bits into - size_t sz = 1; - dst = malloc(sz); - std::memcpy((void*)dst, &src, sz); - - return std::pair(dst, data.size()); - } - case TILEDB_INT8: { - auto data = enmr.as_vector(); - return std::pair(_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_UINT8: { - auto data = enmr.as_vector(); - return std::pair( - ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_INT16: { - auto data = enmr.as_vector(); - return std::pair( - ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_UINT16: { - auto data = enmr.as_vector(); - return std::pair( - ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_INT32: { - auto data = enmr.as_vector(); - return std::pair( - ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_UINT32: { - auto data = enmr.as_vector(); - return std::pair( - ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_INT64: { - auto data = enmr.as_vector(); - return std::pair( - ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_UINT64: { - auto data = enmr.as_vector(); - return std::pair( - ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_FLOAT32: { - auto data = enmr.as_vector(); - return std::pair( - ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_FLOAT64: { - auto data = enmr.as_vector(); - return std::pair( - ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - default: - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB dict datatype: {} ", - tiledb::impl::type_to_str(enmr.type()))); - } +std::pair +ArrowAdapter::_get_data_and_length(Enumeration &enmr, const void *dst) { + switch (enmr.type()) { + case TILEDB_BOOL: { + // We must handle this specially because vector does + // not store elements contiguously in memory + auto data = enmr.as_vector(); + + // Represent the Boolean vector with, at most, the last two + // bits. In Arrow, Boolean values are LSB packed + uint8_t src = 0; + for (size_t i = 0; i < data.size(); ++i) + src |= (data[i] << i); + + // Allocate a single byte to copy the bits into + size_t sz = 1; + dst = malloc(sz); + std::memcpy((void *)dst, &src, sz); + + return std::pair(dst, data.size()); + } + case TILEDB_INT8: { + auto data = enmr.as_vector(); + return std::pair(_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_UINT8: { + auto data = enmr.as_vector(); + return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_INT16: { + auto data = enmr.as_vector(); + return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_UINT16: { + auto data = enmr.as_vector(); + return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_INT32: { + auto data = enmr.as_vector(); + return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_UINT32: { + auto data = enmr.as_vector(); + return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_INT64: { + auto data = enmr.as_vector(); + return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_UINT64: { + auto data = enmr.as_vector(); + return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_FLOAT32: { + auto data = enmr.as_vector(); + return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_FLOAT64: { + auto data = enmr.as_vector(); + return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + default: + throw TileDBSOMAError( + fmt::format("ArrowAdapter: Unsupported TileDB dict datatype: {} ", + tiledb::impl::type_to_str(enmr.type()))); + } } bool ArrowAdapter::_isstr(const char* format) { @@ -306,294 +283,287 @@ bool ArrowAdapter::_isstr(const char* format) { return false; } -inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { - if (ec != NANOARROW_OK) - throw TileDBSOMAError( - fmt::format("ArrowAdapter: Arrow Error {} ", msg)); +inline void exitIfError(const ArrowErrorCode ec, const std::string &msg) { + if (ec != NANOARROW_OK) + throw TileDBSOMAError(fmt::format("ArrowAdapter: Arrow Error {} ", msg)); } std::pair, std::unique_ptr> ArrowAdapter::to_arrow(std::shared_ptr column) { - std::unique_ptr schema = std::make_unique(); - std::unique_ptr array = std::make_unique(); - auto sch = schema.get(); - auto arr = array.get(); - - auto coltype = to_arrow_format(column->type()).data(); - auto natype = to_nanoarrow_type(coltype); - exitIfError(ArrowSchemaInitFromType(sch, natype), "Bad schema init"); - exitIfError( - ArrowSchemaSetName(sch, column->name().data()), "Bad schema name"); - exitIfError( - ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); - - schema->format = strdup(to_arrow_format(column->type()).data()); - schema->name = strdup(column->name().data()); - schema->metadata = nullptr; - schema->flags = 0; - schema->n_children = 0; - schema->children = nullptr; - schema->dictionary = nullptr; - schema->release = &release_schema; - schema->private_data = nullptr; - - // this will be 3 for char vecs and 2 for enumerations - int n_buffers = column->is_var() ? 3 : 2; - - // Create an ArrowBuffer to manage the lifetime of `column`. - // - `arrow_buffer` holds shared_ptr to `column`, increments - // the use count and keeps the ColumnBuffer data alive. - // - When the arrow array is released, `array->release()` is - // called with `arrow_buffer` in `private_data`. - // `arrow_buffer` is deleted, which decrements the the - // `column` use count. When the `column` use count reaches - // 0, the ColumnBuffer data will be deleted. - auto arrow_buffer = new ArrowBuffer(column); - - exitIfError(ArrowArrayInitFromType(arr, natype), "Bad array init"); - exitIfError(ArrowArrayAllocateChildren(arr, 0), "Bad array children alloc"); - array->length = column->size(); - - LOG_TRACE(fmt::format( - "[ArrowAdapter] column type {} name {} nbuf {} {} nullable {}", - to_arrow_format(column->type()).data(), - column->name().data(), - n_buffers, - array->n_buffers, - column->is_nullable())); - - array->null_count = 0; - array->offset = 0; - array->n_buffers = n_buffers; - array->n_children = 0; - array->buffers = nullptr; - array->children = nullptr; - array->dictionary = nullptr; - array->release = &release_array; - array->private_data = (void*)arrow_buffer; - - LOG_TRACE(fmt::format( - "[ArrowAdapter] create array name='{}' use_count={}", - column->name(), - column.use_count())); - - array->buffers = (const void**)malloc( - sizeof(void*) * n_buffers); // new const void*[n_buffers]; - assert(array->buffers != nullptr); - array->buffers[0] = nullptr; // validity addressed below - array->buffers[n_buffers - 1] = column->data().data(); // data - if (n_buffers == 3) { - array->buffers[1] = column->offsets().data(); // offsets - } - - if (column->is_nullable()) { - schema->flags |= ARROW_FLAG_NULLABLE; // it is also set by default - - // Count nulls - for (auto v : column->validity()) { - array->null_count += v == 0; - } - - // Convert validity bytemap to a bitmap in place - column->validity_to_bitmap(); - array->buffers[0] = column->validity().data(); - } else { - schema->flags = 0; // as ArrowSchemaInitFromType leads to NULLABLE set + std::unique_ptr schema = std::make_unique(); + std::unique_ptr array = std::make_unique(); + auto sch = schema.get(); + auto arr = array.get(); + + auto coltype = to_arrow_format(column->type()).data(); + auto natype = to_nanoarrow_type(coltype); + exitIfError(ArrowSchemaInitFromType(sch, natype), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, column->name().data()), + "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); + + schema->format = strdup(to_arrow_format(column->type()).data()); + schema->name = strdup(column->name().data()); + schema->metadata = nullptr; + schema->flags = 0; + schema->n_children = 0; + schema->children = nullptr; + schema->dictionary = nullptr; + schema->release = &release_schema; + schema->private_data = nullptr; + + // this will be 3 for char vecs and 2 for enumerations + int n_buffers = column->is_var() ? 3 : 2; + + // Create an ArrowBuffer to manage the lifetime of `column`. + // - `arrow_buffer` holds shared_ptr to `column`, increments + // the use count and keeps the ColumnBuffer data alive. + // - When the arrow array is released, `array->release()` is + // called with `arrow_buffer` in `private_data`. + // `arrow_buffer` is deleted, which decrements the the + // `column` use count. When the `column` use count reaches + // 0, the ColumnBuffer data will be deleted. + auto arrow_buffer = new ArrowBuffer(column); + + exitIfError(ArrowArrayInitFromType(arr, natype), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, 0), "Bad array children alloc"); + array->length = column->size(); + + LOG_TRACE(fmt::format( + "[ArrowAdapter] column type {} name {} nbuf {} {} nullable {}", + to_arrow_format(column->type()).data(), column->name().data(), n_buffers, + array->n_buffers, column->is_nullable())); + + array->null_count = 0; + array->offset = 0; + array->n_buffers = n_buffers; + array->n_children = 0; + array->buffers = nullptr; + array->children = nullptr; + array->dictionary = nullptr; + array->release = &release_array; + array->private_data = (void *)arrow_buffer; + + LOG_TRACE(fmt::format("[ArrowAdapter] create array name='{}' use_count={}", + column->name(), column.use_count())); + + array->buffers = (const void **)malloc( + sizeof(void *) * n_buffers); // new const void*[n_buffers]; + assert(array->buffers != nullptr); + array->buffers[0] = nullptr; // validity addressed below + array->buffers[n_buffers - 1] = column->data().data(); // data + if (n_buffers == 3) { + array->buffers[1] = column->offsets().data(); // offsets + } + + if (column->is_nullable()) { + schema->flags |= ARROW_FLAG_NULLABLE; // it is also set by default + + // Count nulls + for (auto v : column->validity()) { + array->null_count += v == 0; } - if (column->is_ordered()) { - schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; + // Convert validity bytemap to a bitmap in place + column->validity_to_bitmap(); + array->buffers[0] = column->validity().data(); + } else { + schema->flags = 0; // as ArrowSchemaInitFromType leads to NULLABLE set + } + + if (column->is_ordered()) { + schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; + } + + // Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean + if (column->type() == TILEDB_BOOL) { + column->data_to_bitmap(); + } + + // Workaround for date + if (column->type() == TILEDB_DATETIME_DAY) { + // TODO: Put in ColumnBuffer + size_t n = array->length; + std::vector indata(n); + std::memcpy(indata.data(), column->data().data(), + sizeof(int64_t) * n); + std::vector vec(n); + for (size_t i = 0; i < n; i++) { + vec[i] = static_cast(indata[i]); } - - // Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean - if (column->type() == TILEDB_BOOL) { - column->data_to_bitmap(); + std::memcpy((void *)array->buffers[n_buffers - 1], vec.data(), + sizeof(int32_t) * n); + } + + if (column->has_enumeration()) { + auto dict_sch = + (ArrowSchema *)malloc(sizeof(ArrowSchema)); // new ArrowSchema; + auto dict_arr = (ArrowArray *)malloc(sizeof(ArrowArray)); // new ArrowArray; + + auto enmr = column->get_enumeration_info(); + auto dcoltype = to_arrow_format(enmr->type(), false).data(); + auto dnatype = to_nanoarrow_type(dcoltype); + + exitIfError(ArrowSchemaInitFromType(dict_sch, dnatype), "Bad schema init"); + exitIfError(ArrowSchemaSetName(dict_sch, ""), "Bad schema name"); + exitIfError(ArrowSchemaAllocateChildren(dict_sch, 0), + "Bad schema children alloc"); + dict_sch->format = strdup(to_arrow_format(enmr->type(), false).data()); + dict_sch->name = nullptr; + dict_sch->metadata = nullptr; + dict_sch->flags = 0; + dict_sch->n_children = 0; + dict_sch->children = nullptr; + dict_sch->dictionary = nullptr; + dict_sch->release = &release_schema; + dict_sch->private_data = nullptr; + + exitIfError(ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(dict_arr, 0), + "Bad array children alloc"); + const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2; + dict_arr->null_count = 0; + dict_arr->offset = 0; + dict_arr->n_buffers = n_buf; + dict_arr->n_children = 0; + dict_arr->buffers = nullptr; + dict_arr->children = nullptr; + dict_arr->dictionary = nullptr; + dict_arr->release = &release_array; + dict_arr->private_data = nullptr; + dict_arr->buffers = (const void **)malloc(sizeof(void *) * n_buf); + dict_arr->buffers[0] = nullptr; // validity: none here + + // TODO string types currently get the data and offset + // buffers from ColumnBuffer::enum_offsets and + // ColumnBuffer::enum_string which is retrieved via + // ColumnBuffer::convert_enumeration. This may be refactored + // to all use ColumnBuffer::get_enumeration_info. Note that + // ColumnBuffer::has_enumeration may also be removed in a + // future refactor as ColumnBuffer::get_enumeration_info + // returns std::optional where std::nullopt indicates the + // column does not contain enumerated values. + if (enmr->type() == TILEDB_STRING_ASCII or + enmr->type() == TILEDB_STRING_UTF8) { + auto dict_vec = enmr->as_vector(); + column->convert_enumeration(); + dict_arr->buffers[1] = column->enum_offsets().data(); + dict_arr->buffers[2] = column->enum_string().data(); + dict_arr->length = dict_vec.size(); + } else { + auto [dict_data, dict_length] = + _get_data_and_length(*enmr, dict_arr->buffers[1]); + dict_arr->buffers[1] = dict_data; + dict_arr->length = dict_length; } - // Workaround for date - if (column->type() == TILEDB_DATETIME_DAY) { - // TODO: Put in ColumnBuffer - size_t n = array->length; - std::vector indata(n); - std::memcpy( - indata.data(), column->data().data(), sizeof(int64_t) * n); - std::vector vec(n); - for (size_t i = 0; i < n; i++) { - vec[i] = static_cast(indata[i]); - } - std::memcpy( - (void*)array->buffers[n_buffers - 1], - vec.data(), - sizeof(int32_t) * n); - } + schema->dictionary = dict_sch; + array->dictionary = dict_arr; + } - if (column->has_enumeration()) { - auto dict_sch = (ArrowSchema*)malloc( - sizeof(ArrowSchema)); // new ArrowSchema; - auto dict_arr = (ArrowArray*)malloc( - sizeof(ArrowArray)); // new ArrowArray; - - auto enmr = column->get_enumeration_info(); - auto dcoltype = to_arrow_format(enmr->type(), false).data(); - auto dnatype = to_nanoarrow_type(dcoltype); - - exitIfError( - ArrowSchemaInitFromType(dict_sch, dnatype), "Bad schema init"); - exitIfError(ArrowSchemaSetName(dict_sch, ""), "Bad schema name"); - exitIfError( - ArrowSchemaAllocateChildren(dict_sch, 0), - "Bad schema children alloc"); - dict_sch->format = strdup(to_arrow_format(enmr->type(), false).data()); - dict_sch->name = nullptr; - dict_sch->metadata = nullptr; - dict_sch->flags = 0; - dict_sch->n_children = 0; - dict_sch->children = nullptr; - dict_sch->dictionary = nullptr; - dict_sch->release = &release_schema; - dict_sch->private_data = nullptr; - - exitIfError( - ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); - exitIfError( - ArrowArrayAllocateChildren(dict_arr, 0), - "Bad array children alloc"); - dict_arr->release = &release_array; - const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2; - dict_arr->private_data = nullptr; - dict_arr->buffers = (const void**)malloc(sizeof(void*) * n_buf); - dict_arr->buffers[0] = nullptr; // validity: none here - - // TODO string types currently get the data and offset - // buffers from ColumnBuffer::enum_offsets and - // ColumnBuffer::enum_string which is retrieved via - // ColumnBuffer::convert_enumeration. This may be refactored - // to all use ColumnBuffer::get_enumeration_info. Note that - // ColumnBuffer::has_enumeration may also be removed in a - // future refactor as ColumnBuffer::get_enumeration_info - // returns std::optional where std::nullopt indicates the - // column does not contain enumerated values. - if (enmr->type() == TILEDB_STRING_ASCII or - enmr->type() == TILEDB_STRING_UTF8 or enmr->type() == TILEDB_CHAR) { - auto dict_vec = enmr->as_vector(); - column->convert_enumeration(); - dict_arr->buffers[1] = column->enum_offsets().data(); - dict_arr->buffers[2] = column->enum_string().data(); - dict_arr->length = dict_vec.size(); - } else { - auto [dict_data, dict_length] = _get_data_and_length( - *enmr, dict_arr->buffers[1]); - dict_arr->buffers[1] = dict_data; - dict_arr->length = dict_length; - } - - schema->dictionary = dict_sch; - array->dictionary = dict_arr; - } - - return std::pair(std::move(array), std::move(schema)); + return std::pair(std::move(array), std::move(schema)); } -std::string_view ArrowAdapter::to_arrow_format( - tiledb_datatype_t datatype, bool use_large) { - switch (datatype) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - return use_large ? "U" : "u"; // large because TileDB - // uses 64bit offsets - case TILEDB_CHAR: - case TILEDB_BLOB: - return use_large ? "Z" : "z"; // large because TileDB - // uses 64bit offsets - case TILEDB_BOOL: - return "b"; - case TILEDB_INT32: - return "i"; - case TILEDB_INT64: - return "l"; - case TILEDB_FLOAT32: - return "f"; - case TILEDB_FLOAT64: - return "g"; - case TILEDB_INT8: - return "c"; - case TILEDB_UINT8: - return "C"; - case TILEDB_INT16: - return "s"; - case TILEDB_UINT16: - return "S"; - case TILEDB_UINT32: - return "I"; - case TILEDB_UINT64: - return "L"; - case TILEDB_TIME_SEC: - return "tts"; - case TILEDB_TIME_MS: - return "ttm"; - case TILEDB_TIME_US: - return "ttu"; - case TILEDB_TIME_NS: - return "ttn"; - case TILEDB_DATETIME_DAY: - return "tdD"; - case TILEDB_DATETIME_SEC: - return "tss:"; - case TILEDB_DATETIME_MS: - return "tsm:"; - case TILEDB_DATETIME_US: - return "tsu:"; - case TILEDB_DATETIME_NS: - return "tsn:"; - default: - break; - } - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype: {} ", - tiledb::impl::type_to_str(datatype))); +std::string_view ArrowAdapter::to_arrow_format(tiledb_datatype_t datatype, + bool use_large) { + switch (datatype) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + return use_large ? "U" : "u"; // large because TileDB + // uses 64bit offsets + case TILEDB_CHAR: + case TILEDB_BLOB: + return use_large ? "Z" : "z"; // large because TileDB + // uses 64bit offsets + case TILEDB_BOOL: + return "b"; + case TILEDB_INT32: + return "i"; + case TILEDB_INT64: + return "l"; + case TILEDB_FLOAT32: + return "f"; + case TILEDB_FLOAT64: + return "g"; + case TILEDB_INT8: + return "c"; + case TILEDB_UINT8: + return "C"; + case TILEDB_INT16: + return "s"; + case TILEDB_UINT16: + return "S"; + case TILEDB_UINT32: + return "I"; + case TILEDB_UINT64: + return "L"; + case TILEDB_TIME_SEC: + return "tts"; + case TILEDB_TIME_MS: + return "ttm"; + case TILEDB_TIME_US: + return "ttu"; + case TILEDB_TIME_NS: + return "ttn"; + case TILEDB_DATETIME_DAY: + return "tdD"; + case TILEDB_DATETIME_SEC: + return "tss:"; + case TILEDB_DATETIME_MS: + return "tsm:"; + case TILEDB_DATETIME_US: + return "tsu:"; + case TILEDB_DATETIME_NS: + return "tsn:"; + default: + break; + } + throw TileDBSOMAError( + fmt::format("ArrowAdapter: Unsupported TileDB datatype: {} ", + tiledb::impl::type_to_str(datatype))); } // FIXME: Add more types, maybe make it a map enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { - if (sv == "i") - return NANOARROW_TYPE_INT32; - else if (sv == "c") - return NANOARROW_TYPE_INT8; - else if (sv == "C") - return NANOARROW_TYPE_UINT8; - else if (sv == "s") - return NANOARROW_TYPE_INT16; - else if (sv == "S") - return NANOARROW_TYPE_UINT16; - else if (sv == "I") - return NANOARROW_TYPE_UINT32; - else if (sv == "l") - return NANOARROW_TYPE_INT64; - else if (sv == "L") - return NANOARROW_TYPE_UINT64; - else if (sv == "f") - return NANOARROW_TYPE_FLOAT; - else if (sv == "g") - return NANOARROW_TYPE_DOUBLE; - else if (sv == "u") - return NANOARROW_TYPE_STRING; - else if (sv == "U") - return NANOARROW_TYPE_LARGE_STRING; - else if (sv == "b") - return NANOARROW_TYPE_BOOL; - else if (sv == "tss:") - return NANOARROW_TYPE_INT64; // NB time resolution set indepedently - else if (sv == "tsm:") - return NANOARROW_TYPE_INT64; // NB time resolution set indepedently - else if (sv == "tdD") - return NANOARROW_TYPE_DOUBLE; // R Date: fractional days since epoch - else if (sv == "z") - return NANOARROW_TYPE_BINARY; - else if (sv == "Z") - return NANOARROW_TYPE_LARGE_BINARY; - else - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype string: {} ", sv)); + if (sv == "i") + return NANOARROW_TYPE_INT32; + else if (sv == "c") + return NANOARROW_TYPE_INT8; + else if (sv == "C") + return NANOARROW_TYPE_UINT8; + else if (sv == "s") + return NANOARROW_TYPE_INT16; + else if (sv == "S") + return NANOARROW_TYPE_UINT16; + else if (sv == "I") + return NANOARROW_TYPE_UINT32; + else if (sv == "l") + return NANOARROW_TYPE_INT64; + else if (sv == "L") + return NANOARROW_TYPE_UINT64; + else if (sv == "f") + return NANOARROW_TYPE_FLOAT; + else if (sv == "g") + return NANOARROW_TYPE_DOUBLE; + else if (sv == "u") + return NANOARROW_TYPE_STRING; + else if (sv == "U") + return NANOARROW_TYPE_LARGE_STRING; + else if (sv == "b") + return NANOARROW_TYPE_BOOL; + else if (sv == "tss:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsm:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tdD") + return NANOARROW_TYPE_DOUBLE; // R Date: fractional days since epoch + else if (sv == "z") + return NANOARROW_TYPE_BINARY; + else if (sv == "Z") + return NANOARROW_TYPE_LARGE_BINARY; + else + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB datatype string: {} ", sv)); } -} // namespace tiledbsoma +} // namespace tiledbsoma From 9ab125c6019a2edd07a28a970d066608a0b8c6e4 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 19 Mar 2024 15:40:48 -0500 Subject: [PATCH 20/39] Remove initialization setters covered by nanoarrow use --- libtiledbsoma/src/utils/arrow_adapter.cc | 41 ++++-------------------- 1 file changed, 6 insertions(+), 35 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index b2c810f876..c79e83aec9 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -301,16 +301,9 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { exitIfError(ArrowSchemaSetName(sch, column->name().data()), "Bad schema name"); exitIfError(ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); - - schema->format = strdup(to_arrow_format(column->type()).data()); - schema->name = strdup(column->name().data()); - schema->metadata = nullptr; - schema->flags = 0; - schema->n_children = 0; - schema->children = nullptr; - schema->dictionary = nullptr; + // After allocating and initializing via nanoarrow we + // hook our custom release function in schema->release = &release_schema; - schema->private_data = nullptr; // this will be 3 for char vecs and 2 for enumerations int n_buffers = column->is_var() ? 3 : 2; @@ -334,21 +327,15 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { to_arrow_format(column->type()).data(), column->name().data(), n_buffers, array->n_buffers, column->is_nullable())); - array->null_count = 0; - array->offset = 0; - array->n_buffers = n_buffers; - array->n_children = 0; - array->buffers = nullptr; - array->children = nullptr; - array->dictionary = nullptr; + // After allocating and initializing via nanoarrow we + // hook our custom release function in array->release = &release_array; array->private_data = (void *)arrow_buffer; LOG_TRACE(fmt::format("[ArrowAdapter] create array name='{}' use_count={}", column->name(), column.use_count())); - array->buffers = (const void **)malloc( - sizeof(void *) * n_buffers); // new const void*[n_buffers]; + array->buffers = (const void **)malloc(sizeof(void *) * n_buffers); assert(array->buffers != nullptr); array->buffers[0] = nullptr; // validity addressed below array->buffers[n_buffers - 1] = column->data().data(); // data @@ -408,31 +395,15 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { exitIfError(ArrowSchemaSetName(dict_sch, ""), "Bad schema name"); exitIfError(ArrowSchemaAllocateChildren(dict_sch, 0), "Bad schema children alloc"); - dict_sch->format = strdup(to_arrow_format(enmr->type(), false).data()); - dict_sch->name = nullptr; - dict_sch->metadata = nullptr; - dict_sch->flags = 0; - dict_sch->n_children = 0; - dict_sch->children = nullptr; - dict_sch->dictionary = nullptr; dict_sch->release = &release_schema; - dict_sch->private_data = nullptr; exitIfError(ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); exitIfError(ArrowArrayAllocateChildren(dict_arr, 0), "Bad array children alloc"); const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2; - dict_arr->null_count = 0; - dict_arr->offset = 0; - dict_arr->n_buffers = n_buf; - dict_arr->n_children = 0; - dict_arr->buffers = nullptr; - dict_arr->children = nullptr; - dict_arr->dictionary = nullptr; - dict_arr->release = &release_array; - dict_arr->private_data = nullptr; dict_arr->buffers = (const void **)malloc(sizeof(void *) * n_buf); dict_arr->buffers[0] = nullptr; // validity: none here + dict_arr->release = &release_array; // TODO string types currently get the data and offset // buffers from ColumnBuffer::enum_offsets and From bd4ed2a622daf52419d1ee556c3e35883b61d60c Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 19 Mar 2024 21:33:00 -0500 Subject: [PATCH 21/39] Ensure DATETIME columns get Arrow coltype reset --- libtiledbsoma/src/utils/arrow_adapter.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index c79e83aec9..c215beafe2 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -367,8 +367,16 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { column->data_to_bitmap(); } + // Workaround for datetime + if (column->type() == TILEDB_DATETIME_MS || column->type() == TILEDB_DATETIME_SEC) { + free((void*)schema->format); // free the 'storage' format + schema->format = strdup(to_arrow_format(column->type()).data()); + } + // Workaround for date if (column->type() == TILEDB_DATETIME_DAY) { + free((void*)schema->format); // free the 'storage' format + schema->format = strdup(to_arrow_format(column->type()).data()); // TODO: Put in ColumnBuffer size_t n = array->length; std::vector indata(n); @@ -383,9 +391,8 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { } if (column->has_enumeration()) { - auto dict_sch = - (ArrowSchema *)malloc(sizeof(ArrowSchema)); // new ArrowSchema; - auto dict_arr = (ArrowArray *)malloc(sizeof(ArrowArray)); // new ArrowArray; + auto dict_sch = (ArrowSchema *)malloc(sizeof(ArrowSchema)); + auto dict_arr = (ArrowArray *)malloc(sizeof(ArrowArray)); auto enmr = column->get_enumeration_info(); auto dcoltype = to_arrow_format(enmr->type(), false).data(); From 7f999bae4531f9198d54124543859b6d1024f657 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Thu, 21 Mar 2024 15:41:49 -0500 Subject: [PATCH 22/39] Add more date and datetime support --- apis/r/R/utils-arrow.R | 6 ++++-- libtiledbsoma/src/utils/arrow_adapter.cc | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/apis/r/R/utils-arrow.R b/apis/r/R/utils-arrow.R index cda8448b05..55832d226e 100644 --- a/apis/r/R/utils-arrow.R +++ b/apis/r/R/utils-arrow.R @@ -66,12 +66,14 @@ tiledb_type_from_arrow_type <- function(x, is_dim) { utf8 = "UTF8", string = "UTF8", large_utf8 = "UTF8", - # date32 = "date32", + # based on what TileDB supports + date32 = "DATETIME_DAY", # date64 = "date64", # time32 = "time32", # time64 = "time64", # null = "null", - # timestamp = "timestamp", + # based on what TileDB supports with a default msec res. + timestamp = "DATETIME_MS", # decimal128 = "decimal128", # decimal256 = "decimal256", # struct = "struct", diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index c215beafe2..d9af8bd12c 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -368,7 +368,9 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { } // Workaround for datetime - if (column->type() == TILEDB_DATETIME_MS || column->type() == TILEDB_DATETIME_SEC) { + if (column->type() == TILEDB_DATETIME_SEC || + column->type() == TILEDB_DATETIME_MS || + column->type() == TILEDB_DATETIME_NS) { free((void*)schema->format); // free the 'storage' format schema->format = strdup(to_arrow_format(column->type()).data()); } @@ -533,8 +535,10 @@ enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { return NANOARROW_TYPE_INT64; // NB time resolution set indepedently else if (sv == "tsm:") return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsn:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently else if (sv == "tdD") - return NANOARROW_TYPE_DOUBLE; // R Date: fractional days since epoch + return NANOARROW_TYPE_INT32; // R Date: fractional days since epoch else if (sv == "z") return NANOARROW_TYPE_BINARY; else if (sv == "Z") From 48c3816fd1e7c243fff66b1a0950774187e5dd7c Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 26 Mar 2024 17:59:37 -0500 Subject: [PATCH 23/39] Additional conversion --- libtiledbsoma/src/utils/arrow_adapter.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index d9af8bd12c..c79c3bb652 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -537,6 +537,8 @@ enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { return NANOARROW_TYPE_INT64; // NB time resolution set indepedently else if (sv == "tsn:") return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsu:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently else if (sv == "tdD") return NANOARROW_TYPE_INT32; // R Date: fractional days since epoch else if (sv == "z") From cab5bf0e05de57eaa1f99f83373aff3283f4477a Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Wed, 27 Mar 2024 07:35:46 -0500 Subject: [PATCH 24/39] Post-rebase change --- libtiledbsoma/src/utils/arrow_adapter.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index c79c3bb652..39034e745e 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -191,6 +191,13 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( auto dict = (ArrowSchema *)malloc(sizeof(ArrowSchema)); dict->format = strdup(ArrowAdapter::to_arrow_format(enmr.type(), false).data()); + if (enmr.type() == TILEDB_STRING_ASCII or + enmr.type() == TILEDB_CHAR) { + dict->format = strdup("z"); + } else { + dict->format = strdup( + ArrowAdapter::to_arrow_format(enmr.type(), false).data()); + } dict->name = strdup(enmr.name().c_str()); dict->metadata = nullptr; dict->flags = 0; @@ -409,7 +416,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { exitIfError(ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); exitIfError(ArrowArrayAllocateChildren(dict_arr, 0), "Bad array children alloc"); - const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2; + const int n_buf = ArrowAdapter::_isstr(dict_sch->format) == 0 ? 3 : 2; dict_arr->buffers = (const void **)malloc(sizeof(void *) * n_buf); dict_arr->buffers[0] = nullptr; // validity: none here dict_arr->release = &release_array; @@ -424,7 +431,8 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { // returns std::optional where std::nullopt indicates the // column does not contain enumerated values. if (enmr->type() == TILEDB_STRING_ASCII or - enmr->type() == TILEDB_STRING_UTF8) { + enmr->type() == TILEDB_STRING_UTF8 or + enmr->type() == TILEDB_CHAR) { auto dict_vec = enmr->as_vector(); column->convert_enumeration(); dict_arr->buffers[1] = column->enum_offsets().data(); From ee445f95f2d54d55afc0aa5d687a0d658d05a1e0 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Wed, 27 Mar 2024 07:46:37 -0500 Subject: [PATCH 25/39] Heeding time to the lord of linting is time well spent some say --- libtiledbsoma/src/utils/arrow_adapter.cc | 980 ++++++++++++----------- 1 file changed, 503 insertions(+), 477 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 39034e745e..a4ca011af6 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -38,248 +38,262 @@ namespace tiledbsoma { using namespace tiledb; -void ArrowAdapter::release_schema(struct ArrowSchema *schema) { - if (schema->name != nullptr) - LOG_DEBUG( - fmt::format("[ArrowAdapter] release_schema for {}", schema->name)); - - if (schema->name != nullptr) { - LOG_TRACE("[ArrowAdapter] release_schema schema->name"); - free((void *)schema->name); - schema->name = nullptr; - } - if (schema->format != nullptr) { - LOG_TRACE("[ArrowAdapter] release_schema schema->format"); - free((void *)schema->format); - schema->format = nullptr; - } - if (schema->metadata != nullptr) { - LOG_TRACE("[ArrowAdapter] release_schema schema->metadata"); - free((void *)schema->metadata); - schema->metadata = nullptr; - } - - if (schema->children != nullptr) { - for (auto i = 0; i < schema->n_children; i++) { - if (schema->children[i] != nullptr) { - if (schema->children[i]->release != nullptr) { - LOG_TRACE( - fmt::format("[ArrowAdapter] release_schema schema->child {} " - "release", - i)); - release_schema(schema->children[i]); +void ArrowAdapter::release_schema(struct ArrowSchema* schema) { + if (schema->name != nullptr) + LOG_DEBUG( + fmt::format("[ArrowAdapter] release_schema for {}", schema->name)); + + if (schema->name != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->name"); + free((void*)schema->name); + schema->name = nullptr; + } + if (schema->format != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->format"); + free((void*)schema->format); + schema->format = nullptr; + } + if (schema->metadata != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->metadata"); + free((void*)schema->metadata); + schema->metadata = nullptr; + } + + if (schema->children != nullptr) { + for (auto i = 0; i < schema->n_children; i++) { + if (schema->children[i] != nullptr) { + if (schema->children[i]->release != nullptr) { + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema schema->child {} " + "release", + i)); + release_schema(schema->children[i]); + } + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema schema->child {} free", i)); + free(schema->children[i]); + } } - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_schema schema->child {} free", i)); - free(schema->children[i]); - } + LOG_TRACE("[ArrowAdapter] release_schema schema->children"); + free(schema->children); + schema->children = nullptr; } - LOG_TRACE("[ArrowAdapter] release_schema schema->children"); - free(schema->children); - schema->children = nullptr; - } - - if (schema->dictionary != nullptr) { - if (schema->dictionary->release != nullptr) { - LOG_TRACE("[ArrowAdapter] release_schema schema->dict release"); - release_schema(schema->dictionary); + + if (schema->dictionary != nullptr) { + if (schema->dictionary->release != nullptr) { + LOG_TRACE("[ArrowAdapter] release_schema schema->dict release"); + release_schema(schema->dictionary); + } + LOG_TRACE("[ArrowAdapter] release_schema schema->dict free"); + free(schema->dictionary); + schema->dictionary = nullptr; } - LOG_TRACE("[ArrowAdapter] release_schema schema->dict free"); - free(schema->dictionary); - schema->dictionary = nullptr; - } - schema->release = nullptr; - LOG_TRACE("[ArrowAdapter] release_schema done"); + schema->release = nullptr; + LOG_TRACE("[ArrowAdapter] release_schema done"); } -void ArrowAdapter::release_array(struct ArrowArray *array) { - auto arrow_buffer = static_cast(array->private_data); - LOG_TRACE(fmt::format("[ArrowAdapter] release_array {} use_count={}", - arrow_buffer->buffer_->name(), - arrow_buffer->buffer_.use_count())); - - // Delete the ArrowBuffer, which was allocated with new. - // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the - // underlying ColumnBuffer, the ColumnBuffer will be deleted. - delete arrow_buffer; - - if (array->buffers != nullptr) { - delete[] array->buffers; - array->buffers = nullptr; - } - - if (array->children != nullptr) { - for (auto i = 0; i < array->n_children; i++) { - if (array->children[i] != nullptr) { - if (array->children[i]->release != nullptr) { - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_schema array->child {} release", i)); - release_array(array->children[i]); +void ArrowAdapter::release_array(struct ArrowArray* array) { + auto arrow_buffer = static_cast(array->private_data); + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_array {} use_count={}", + arrow_buffer->buffer_->name(), + arrow_buffer->buffer_.use_count())); + + // Delete the ArrowBuffer, which was allocated with new. + // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the + // underlying ColumnBuffer, the ColumnBuffer will be deleted. + delete arrow_buffer; + + if (array->buffers != nullptr) { + delete[] array->buffers; + array->buffers = nullptr; + } + + if (array->children != nullptr) { + for (auto i = 0; i < array->n_children; i++) { + if (array->children[i] != nullptr) { + if (array->children[i]->release != nullptr) { + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema array->child {} release", + i)); + release_array(array->children[i]); + } + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_schema array->child {} free", i)); + free(array->children[i]); + } } - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_schema array->child {} free", i)); - free(array->children[i]); - } + LOG_TRACE("[ArrowAdapter] release_array array->children"); + free(array->children); + array->children = nullptr; + } + + if (array->dictionary != nullptr) { + // TODO: This can lead to segfault on some data sets, could be caused + // by how we fill arrow data structures. This should pass. + // if (array->dictionary->release != nullptr) { + // LOG_TRACE("[ArrowAdapter] release_array array->dict release"); + // release_array(array->dictionary); + //} + LOG_TRACE("[ArrowAdapter] release_array array->dict free"); + free(array->dictionary); + array->dictionary = nullptr; } - LOG_TRACE("[ArrowAdapter] release_array array->children"); - free(array->children); - array->children = nullptr; - } - - if (array->dictionary != nullptr) { - // TODO: This can lead to segfault on some data sets, could be caused - // by how we fill arrow data structures. This should pass. - // if (array->dictionary->release != nullptr) { - // LOG_TRACE("[ArrowAdapter] release_array array->dict release"); - // release_array(array->dictionary); - //} - LOG_TRACE("[ArrowAdapter] release_array array->dict free"); - free(array->dictionary); - array->dictionary = nullptr; - } - - array->release = nullptr; - LOG_TRACE(fmt::format("[ArrowAdapter] release_array done")); + + array->release = nullptr; + LOG_TRACE(fmt::format("[ArrowAdapter] release_array done")); } std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array) { - auto tiledb_schema = tiledb_array->schema(); - auto ndim = tiledb_schema.domain().ndim(); - auto nattr = tiledb_schema.attribute_num(); - - std::unique_ptr arrow_schema = std::make_unique(); - arrow_schema->format = strdup("+s"); - arrow_schema->n_children = ndim + nattr; - arrow_schema->release = &ArrowAdapter::release_schema; - arrow_schema->children = - (ArrowSchema **)malloc(arrow_schema->n_children * sizeof(ArrowSchema *)); - - ArrowSchema *child = nullptr; - - for (uint32_t i = 0; i < ndim; ++i) { - auto dim = tiledb_schema.domain().dimension(i); - child = arrow_schema->children[i] = - (ArrowSchema *)malloc(sizeof(ArrowSchema)); - child->format = strdup(ArrowAdapter::to_arrow_format(dim.type()).data()); - child->name = strdup(dim.name().c_str()); - child->metadata = nullptr; - child->flags = 0; - child->n_children = 0; - child->dictionary = nullptr; - child->children = nullptr; - child->release = &ArrowAdapter::release_schema; - } - - for (uint32_t i = 0; i < nattr; ++i) { - auto attr = tiledb_schema.attribute(i); - child = arrow_schema->children[ndim + i] = - (ArrowSchema *)malloc(sizeof(ArrowSchema)); - child->format = strdup(ArrowAdapter::to_arrow_format(attr.type()).data()); - child->name = strdup(attr.name().c_str()); - child->metadata = nullptr; - child->flags = attr.nullable() ? ARROW_FLAG_NULLABLE : 0; - child->n_children = 0; - child->children = nullptr; - child->dictionary = nullptr; - - auto enmr_name = AttributeExperimental::get_enumeration_name(*ctx, attr); - if (enmr_name.has_value()) { - auto enmr = - ArrayExperimental::get_enumeration(*ctx, *tiledb_array, attr.name()); - auto dict = (ArrowSchema *)malloc(sizeof(ArrowSchema)); - dict->format = - strdup(ArrowAdapter::to_arrow_format(enmr.type(), false).data()); - if (enmr.type() == TILEDB_STRING_ASCII or - enmr.type() == TILEDB_CHAR) { - dict->format = strdup("z"); - } else { - dict->format = strdup( - ArrowAdapter::to_arrow_format(enmr.type(), false).data()); - } - dict->name = strdup(enmr.name().c_str()); - dict->metadata = nullptr; - dict->flags = 0; - dict->n_children = 0; - dict->children = nullptr; - dict->dictionary = nullptr; - dict->release = &ArrowAdapter::release_schema; - dict->private_data = nullptr; - child->dictionary = dict; + auto tiledb_schema = tiledb_array->schema(); + auto ndim = tiledb_schema.domain().ndim(); + auto nattr = tiledb_schema.attribute_num(); + + std::unique_ptr arrow_schema = std::make_unique(); + arrow_schema->format = strdup("+s"); + arrow_schema->n_children = ndim + nattr; + arrow_schema->release = &ArrowAdapter::release_schema; + arrow_schema->children = (ArrowSchema**)malloc( + arrow_schema->n_children * sizeof(ArrowSchema*)); + + ArrowSchema* child = nullptr; + + for (uint32_t i = 0; i < ndim; ++i) { + auto dim = tiledb_schema.domain().dimension(i); + child = arrow_schema->children[i] = (ArrowSchema*)malloc( + sizeof(ArrowSchema)); + child->format = strdup( + ArrowAdapter::to_arrow_format(dim.type()).data()); + child->name = strdup(dim.name().c_str()); + child->metadata = nullptr; + child->flags = 0; + child->n_children = 0; + child->dictionary = nullptr; + child->children = nullptr; + child->release = &ArrowAdapter::release_schema; } - child->release = &ArrowAdapter::release_schema; - } - return arrow_schema; + for (uint32_t i = 0; i < nattr; ++i) { + auto attr = tiledb_schema.attribute(i); + child = arrow_schema->children[ndim + i] = (ArrowSchema*)malloc( + sizeof(ArrowSchema)); + child->format = strdup( + ArrowAdapter::to_arrow_format(attr.type()).data()); + child->name = strdup(attr.name().c_str()); + child->metadata = nullptr; + child->flags = attr.nullable() ? ARROW_FLAG_NULLABLE : 0; + child->n_children = 0; + child->children = nullptr; + child->dictionary = nullptr; + + auto enmr_name = AttributeExperimental::get_enumeration_name( + *ctx, attr); + if (enmr_name.has_value()) { + auto enmr = ArrayExperimental::get_enumeration( + *ctx, *tiledb_array, attr.name()); + auto dict = (ArrowSchema*)malloc(sizeof(ArrowSchema)); + dict->format = strdup( + ArrowAdapter::to_arrow_format(enmr.type(), false).data()); + if (enmr.type() == TILEDB_STRING_ASCII or + enmr.type() == TILEDB_CHAR) { + dict->format = strdup("z"); + } else { + dict->format = strdup( + ArrowAdapter::to_arrow_format(enmr.type(), false).data()); + } + dict->name = strdup(enmr.name().c_str()); + dict->metadata = nullptr; + dict->flags = 0; + dict->n_children = 0; + dict->children = nullptr; + dict->dictionary = nullptr; + dict->release = &ArrowAdapter::release_schema; + dict->private_data = nullptr; + child->dictionary = dict; + } + child->release = &ArrowAdapter::release_schema; + } + + return arrow_schema; } -std::pair -ArrowAdapter::_get_data_and_length(Enumeration &enmr, const void *dst) { - switch (enmr.type()) { - case TILEDB_BOOL: { - // We must handle this specially because vector does - // not store elements contiguously in memory - auto data = enmr.as_vector(); - - // Represent the Boolean vector with, at most, the last two - // bits. In Arrow, Boolean values are LSB packed - uint8_t src = 0; - for (size_t i = 0; i < data.size(); ++i) - src |= (data[i] << i); - - // Allocate a single byte to copy the bits into - size_t sz = 1; - dst = malloc(sz); - std::memcpy((void *)dst, &src, sz); - - return std::pair(dst, data.size()); - } - case TILEDB_INT8: { - auto data = enmr.as_vector(); - return std::pair(_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_UINT8: { - auto data = enmr.as_vector(); - return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_INT16: { - auto data = enmr.as_vector(); - return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_UINT16: { - auto data = enmr.as_vector(); - return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_INT32: { - auto data = enmr.as_vector(); - return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_UINT32: { - auto data = enmr.as_vector(); - return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_INT64: { - auto data = enmr.as_vector(); - return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_UINT64: { - auto data = enmr.as_vector(); - return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_FLOAT32: { - auto data = enmr.as_vector(); - return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - case TILEDB_FLOAT64: { - auto data = enmr.as_vector(); - return std::pair(ArrowAdapter::_fill_data_buffer(data, dst), data.size()); - } - default: - throw TileDBSOMAError( - fmt::format("ArrowAdapter: Unsupported TileDB dict datatype: {} ", - tiledb::impl::type_to_str(enmr.type()))); - } +std::pair ArrowAdapter::_get_data_and_length( + Enumeration& enmr, const void* dst) { + switch (enmr.type()) { + case TILEDB_BOOL: { + // We must handle this specially because vector does + // not store elements contiguously in memory + auto data = enmr.as_vector(); + + // Represent the Boolean vector with, at most, the last two + // bits. In Arrow, Boolean values are LSB packed + uint8_t src = 0; + for (size_t i = 0; i < data.size(); ++i) + src |= (data[i] << i); + + // Allocate a single byte to copy the bits into + size_t sz = 1; + dst = malloc(sz); + std::memcpy((void*)dst, &src, sz); + + return std::pair(dst, data.size()); + } + case TILEDB_INT8: { + auto data = enmr.as_vector(); + return std::pair(_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_UINT8: { + auto data = enmr.as_vector(); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_INT16: { + auto data = enmr.as_vector(); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_UINT16: { + auto data = enmr.as_vector(); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_INT32: { + auto data = enmr.as_vector(); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_UINT32: { + auto data = enmr.as_vector(); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_INT64: { + auto data = enmr.as_vector(); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_UINT64: { + auto data = enmr.as_vector(); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_FLOAT32: { + auto data = enmr.as_vector(); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + case TILEDB_FLOAT64: { + auto data = enmr.as_vector(); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); + } + default: + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB dict datatype: {} ", + tiledb::impl::type_to_str(enmr.type()))); + } } bool ArrowAdapter::_isstr(const char* format) { @@ -290,272 +304,284 @@ bool ArrowAdapter::_isstr(const char* format) { return false; } -inline void exitIfError(const ArrowErrorCode ec, const std::string &msg) { - if (ec != NANOARROW_OK) - throw TileDBSOMAError(fmt::format("ArrowAdapter: Arrow Error {} ", msg)); +inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { + if (ec != NANOARROW_OK) + throw TileDBSOMAError( + fmt::format("ArrowAdapter: Arrow Error {} ", msg)); } std::pair, std::unique_ptr> ArrowAdapter::to_arrow(std::shared_ptr column) { - std::unique_ptr schema = std::make_unique(); - std::unique_ptr array = std::make_unique(); - auto sch = schema.get(); - auto arr = array.get(); - - auto coltype = to_arrow_format(column->type()).data(); - auto natype = to_nanoarrow_type(coltype); - exitIfError(ArrowSchemaInitFromType(sch, natype), "Bad schema init"); - exitIfError(ArrowSchemaSetName(sch, column->name().data()), - "Bad schema name"); - exitIfError(ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); - // After allocating and initializing via nanoarrow we - // hook our custom release function in - schema->release = &release_schema; - - // this will be 3 for char vecs and 2 for enumerations - int n_buffers = column->is_var() ? 3 : 2; - - // Create an ArrowBuffer to manage the lifetime of `column`. - // - `arrow_buffer` holds shared_ptr to `column`, increments - // the use count and keeps the ColumnBuffer data alive. - // - When the arrow array is released, `array->release()` is - // called with `arrow_buffer` in `private_data`. - // `arrow_buffer` is deleted, which decrements the the - // `column` use count. When the `column` use count reaches - // 0, the ColumnBuffer data will be deleted. - auto arrow_buffer = new ArrowBuffer(column); - - exitIfError(ArrowArrayInitFromType(arr, natype), "Bad array init"); - exitIfError(ArrowArrayAllocateChildren(arr, 0), "Bad array children alloc"); - array->length = column->size(); - - LOG_TRACE(fmt::format( - "[ArrowAdapter] column type {} name {} nbuf {} {} nullable {}", - to_arrow_format(column->type()).data(), column->name().data(), n_buffers, - array->n_buffers, column->is_nullable())); - - // After allocating and initializing via nanoarrow we - // hook our custom release function in - array->release = &release_array; - array->private_data = (void *)arrow_buffer; - - LOG_TRACE(fmt::format("[ArrowAdapter] create array name='{}' use_count={}", - column->name(), column.use_count())); - - array->buffers = (const void **)malloc(sizeof(void *) * n_buffers); - assert(array->buffers != nullptr); - array->buffers[0] = nullptr; // validity addressed below - array->buffers[n_buffers - 1] = column->data().data(); // data - if (n_buffers == 3) { - array->buffers[1] = column->offsets().data(); // offsets - } - - if (column->is_nullable()) { - schema->flags |= ARROW_FLAG_NULLABLE; // it is also set by default - - // Count nulls - for (auto v : column->validity()) { - array->null_count += v == 0; + std::unique_ptr schema = std::make_unique(); + std::unique_ptr array = std::make_unique(); + auto sch = schema.get(); + auto arr = array.get(); + + auto coltype = to_arrow_format(column->type()).data(); + auto natype = to_nanoarrow_type(coltype); + exitIfError(ArrowSchemaInitFromType(sch, natype), "Bad schema init"); + exitIfError( + ArrowSchemaSetName(sch, column->name().data()), "Bad schema name"); + exitIfError( + ArrowSchemaAllocateChildren(sch, 0), "Bad schema children alloc"); + // After allocating and initializing via nanoarrow we + // hook our custom release function in + schema->release = &release_schema; + + // this will be 3 for char vecs and 2 for enumerations + int n_buffers = column->is_var() ? 3 : 2; + + // Create an ArrowBuffer to manage the lifetime of `column`. + // - `arrow_buffer` holds shared_ptr to `column`, increments + // the use count and keeps the ColumnBuffer data alive. + // - When the arrow array is released, `array->release()` is + // called with `arrow_buffer` in `private_data`. + // `arrow_buffer` is deleted, which decrements the the + // `column` use count. When the `column` use count reaches + // 0, the ColumnBuffer data will be deleted. + auto arrow_buffer = new ArrowBuffer(column); + + exitIfError(ArrowArrayInitFromType(arr, natype), "Bad array init"); + exitIfError(ArrowArrayAllocateChildren(arr, 0), "Bad array children alloc"); + array->length = column->size(); + + LOG_TRACE(fmt::format( + "[ArrowAdapter] column type {} name {} nbuf {} {} nullable {}", + to_arrow_format(column->type()).data(), + column->name().data(), + n_buffers, + array->n_buffers, + column->is_nullable())); + + // After allocating and initializing via nanoarrow we + // hook our custom release function in + array->release = &release_array; + array->private_data = (void*)arrow_buffer; + + LOG_TRACE(fmt::format( + "[ArrowAdapter] create array name='{}' use_count={}", + column->name(), + column.use_count())); + + array->buffers = (const void**)malloc(sizeof(void*) * n_buffers); + assert(array->buffers != nullptr); + array->buffers[0] = nullptr; // validity addressed below + array->buffers[n_buffers - 1] = column->data().data(); // data + if (n_buffers == 3) { + array->buffers[1] = column->offsets().data(); // offsets } - // Convert validity bytemap to a bitmap in place - column->validity_to_bitmap(); - array->buffers[0] = column->validity().data(); - } else { - schema->flags = 0; // as ArrowSchemaInitFromType leads to NULLABLE set - } - - if (column->is_ordered()) { - schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; - } - - // Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean - if (column->type() == TILEDB_BOOL) { - column->data_to_bitmap(); - } - - // Workaround for datetime - if (column->type() == TILEDB_DATETIME_SEC || - column->type() == TILEDB_DATETIME_MS || - column->type() == TILEDB_DATETIME_NS) { - free((void*)schema->format); // free the 'storage' format - schema->format = strdup(to_arrow_format(column->type()).data()); - } - - // Workaround for date - if (column->type() == TILEDB_DATETIME_DAY) { - free((void*)schema->format); // free the 'storage' format - schema->format = strdup(to_arrow_format(column->type()).data()); - // TODO: Put in ColumnBuffer - size_t n = array->length; - std::vector indata(n); - std::memcpy(indata.data(), column->data().data(), - sizeof(int64_t) * n); - std::vector vec(n); - for (size_t i = 0; i < n; i++) { - vec[i] = static_cast(indata[i]); - } - std::memcpy((void *)array->buffers[n_buffers - 1], vec.data(), - sizeof(int32_t) * n); - } - - if (column->has_enumeration()) { - auto dict_sch = (ArrowSchema *)malloc(sizeof(ArrowSchema)); - auto dict_arr = (ArrowArray *)malloc(sizeof(ArrowArray)); - - auto enmr = column->get_enumeration_info(); - auto dcoltype = to_arrow_format(enmr->type(), false).data(); - auto dnatype = to_nanoarrow_type(dcoltype); - - exitIfError(ArrowSchemaInitFromType(dict_sch, dnatype), "Bad schema init"); - exitIfError(ArrowSchemaSetName(dict_sch, ""), "Bad schema name"); - exitIfError(ArrowSchemaAllocateChildren(dict_sch, 0), - "Bad schema children alloc"); - dict_sch->release = &release_schema; - - exitIfError(ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); - exitIfError(ArrowArrayAllocateChildren(dict_arr, 0), - "Bad array children alloc"); - const int n_buf = ArrowAdapter::_isstr(dict_sch->format) == 0 ? 3 : 2; - dict_arr->buffers = (const void **)malloc(sizeof(void *) * n_buf); - dict_arr->buffers[0] = nullptr; // validity: none here - dict_arr->release = &release_array; - - // TODO string types currently get the data and offset - // buffers from ColumnBuffer::enum_offsets and - // ColumnBuffer::enum_string which is retrieved via - // ColumnBuffer::convert_enumeration. This may be refactored - // to all use ColumnBuffer::get_enumeration_info. Note that - // ColumnBuffer::has_enumeration may also be removed in a - // future refactor as ColumnBuffer::get_enumeration_info - // returns std::optional where std::nullopt indicates the - // column does not contain enumerated values. - if (enmr->type() == TILEDB_STRING_ASCII or - enmr->type() == TILEDB_STRING_UTF8 or - enmr->type() == TILEDB_CHAR) { - auto dict_vec = enmr->as_vector(); - column->convert_enumeration(); - dict_arr->buffers[1] = column->enum_offsets().data(); - dict_arr->buffers[2] = column->enum_string().data(); - dict_arr->length = dict_vec.size(); + if (column->is_nullable()) { + schema->flags |= ARROW_FLAG_NULLABLE; // it is also set by default + + // Count nulls + for (auto v : column->validity()) { + array->null_count += v == 0; + } + + // Convert validity bytemap to a bitmap in place + column->validity_to_bitmap(); + array->buffers[0] = column->validity().data(); } else { - auto [dict_data, dict_length] = - _get_data_and_length(*enmr, dict_arr->buffers[1]); - dict_arr->buffers[1] = dict_data; - dict_arr->length = dict_length; + schema->flags = 0; // as ArrowSchemaInitFromType leads to NULLABLE set + } + + if (column->is_ordered()) { + schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; + } + + // Workaround to cast TILEDB_BOOL from uint8 to 1-bit Arrow boolean + if (column->type() == TILEDB_BOOL) { + column->data_to_bitmap(); } - schema->dictionary = dict_sch; - array->dictionary = dict_arr; - } + // Workaround for datetime + if (column->type() == TILEDB_DATETIME_SEC || + column->type() == TILEDB_DATETIME_MS || + column->type() == TILEDB_DATETIME_NS) { + free((void*)schema->format); // free the 'storage' format + schema->format = strdup(to_arrow_format(column->type()).data()); + } + + // Workaround for date + if (column->type() == TILEDB_DATETIME_DAY) { + free((void*)schema->format); // free the 'storage' format + schema->format = strdup(to_arrow_format(column->type()).data()); + // TODO: Put in ColumnBuffer + size_t n = array->length; + std::vector indata(n); + std::memcpy( + indata.data(), column->data().data(), sizeof(int64_t) * n); + std::vector vec(n); + for (size_t i = 0; i < n; i++) { + vec[i] = static_cast(indata[i]); + } + std::memcpy( + (void*)array->buffers[n_buffers - 1], + vec.data(), + sizeof(int32_t) * n); + } + + if (column->has_enumeration()) { + auto dict_sch = (ArrowSchema*)malloc(sizeof(ArrowSchema)); + auto dict_arr = (ArrowArray*)malloc(sizeof(ArrowArray)); + + auto enmr = column->get_enumeration_info(); + auto dcoltype = to_arrow_format(enmr->type(), false).data(); + auto dnatype = to_nanoarrow_type(dcoltype); + + exitIfError( + ArrowSchemaInitFromType(dict_sch, dnatype), "Bad schema init"); + exitIfError(ArrowSchemaSetName(dict_sch, ""), "Bad schema name"); + exitIfError( + ArrowSchemaAllocateChildren(dict_sch, 0), + "Bad schema children alloc"); + dict_sch->release = &release_schema; + + exitIfError( + ArrowArrayInitFromType(dict_arr, dnatype), "Bad array init"); + exitIfError( + ArrowArrayAllocateChildren(dict_arr, 0), + "Bad array children alloc"); + const int n_buf = ArrowAdapter::_isstr(dict_sch->format) == 0 ? 3 : 2; + dict_arr->buffers = (const void**)malloc(sizeof(void*) * n_buf); + dict_arr->buffers[0] = nullptr; // validity: none here + dict_arr->release = &release_array; + + // TODO string types currently get the data and offset + // buffers from ColumnBuffer::enum_offsets and + // ColumnBuffer::enum_string which is retrieved via + // ColumnBuffer::convert_enumeration. This may be refactored + // to all use ColumnBuffer::get_enumeration_info. Note that + // ColumnBuffer::has_enumeration may also be removed in a + // future refactor as ColumnBuffer::get_enumeration_info + // returns std::optional where std::nullopt indicates the + // column does not contain enumerated values. + if (enmr->type() == TILEDB_STRING_ASCII or + enmr->type() == TILEDB_STRING_UTF8 or enmr->type() == TILEDB_CHAR) { + auto dict_vec = enmr->as_vector(); + column->convert_enumeration(); + dict_arr->buffers[1] = column->enum_offsets().data(); + dict_arr->buffers[2] = column->enum_string().data(); + dict_arr->length = dict_vec.size(); + } else { + auto [dict_data, dict_length] = _get_data_and_length( + *enmr, dict_arr->buffers[1]); + dict_arr->buffers[1] = dict_data; + dict_arr->length = dict_length; + } - return std::pair(std::move(array), std::move(schema)); + schema->dictionary = dict_sch; + array->dictionary = dict_arr; + } + + return std::pair(std::move(array), std::move(schema)); } -std::string_view ArrowAdapter::to_arrow_format(tiledb_datatype_t datatype, - bool use_large) { - switch (datatype) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - return use_large ? "U" : "u"; // large because TileDB - // uses 64bit offsets - case TILEDB_CHAR: - case TILEDB_BLOB: - return use_large ? "Z" : "z"; // large because TileDB - // uses 64bit offsets - case TILEDB_BOOL: - return "b"; - case TILEDB_INT32: - return "i"; - case TILEDB_INT64: - return "l"; - case TILEDB_FLOAT32: - return "f"; - case TILEDB_FLOAT64: - return "g"; - case TILEDB_INT8: - return "c"; - case TILEDB_UINT8: - return "C"; - case TILEDB_INT16: - return "s"; - case TILEDB_UINT16: - return "S"; - case TILEDB_UINT32: - return "I"; - case TILEDB_UINT64: - return "L"; - case TILEDB_TIME_SEC: - return "tts"; - case TILEDB_TIME_MS: - return "ttm"; - case TILEDB_TIME_US: - return "ttu"; - case TILEDB_TIME_NS: - return "ttn"; - case TILEDB_DATETIME_DAY: - return "tdD"; - case TILEDB_DATETIME_SEC: - return "tss:"; - case TILEDB_DATETIME_MS: - return "tsm:"; - case TILEDB_DATETIME_US: - return "tsu:"; - case TILEDB_DATETIME_NS: - return "tsn:"; - default: - break; - } - throw TileDBSOMAError( - fmt::format("ArrowAdapter: Unsupported TileDB datatype: {} ", - tiledb::impl::type_to_str(datatype))); +std::string_view ArrowAdapter::to_arrow_format( + tiledb_datatype_t datatype, bool use_large) { + switch (datatype) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + return use_large ? "U" : "u"; // large because TileDB + // uses 64bit offsets + case TILEDB_CHAR: + case TILEDB_BLOB: + return use_large ? "Z" : "z"; // large because TileDB + // uses 64bit offsets + case TILEDB_BOOL: + return "b"; + case TILEDB_INT32: + return "i"; + case TILEDB_INT64: + return "l"; + case TILEDB_FLOAT32: + return "f"; + case TILEDB_FLOAT64: + return "g"; + case TILEDB_INT8: + return "c"; + case TILEDB_UINT8: + return "C"; + case TILEDB_INT16: + return "s"; + case TILEDB_UINT16: + return "S"; + case TILEDB_UINT32: + return "I"; + case TILEDB_UINT64: + return "L"; + case TILEDB_TIME_SEC: + return "tts"; + case TILEDB_TIME_MS: + return "ttm"; + case TILEDB_TIME_US: + return "ttu"; + case TILEDB_TIME_NS: + return "ttn"; + case TILEDB_DATETIME_DAY: + return "tdD"; + case TILEDB_DATETIME_SEC: + return "tss:"; + case TILEDB_DATETIME_MS: + return "tsm:"; + case TILEDB_DATETIME_US: + return "tsu:"; + case TILEDB_DATETIME_NS: + return "tsn:"; + default: + break; + } + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB datatype: {} ", + tiledb::impl::type_to_str(datatype))); } // FIXME: Add more types, maybe make it a map enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { - if (sv == "i") - return NANOARROW_TYPE_INT32; - else if (sv == "c") - return NANOARROW_TYPE_INT8; - else if (sv == "C") - return NANOARROW_TYPE_UINT8; - else if (sv == "s") - return NANOARROW_TYPE_INT16; - else if (sv == "S") - return NANOARROW_TYPE_UINT16; - else if (sv == "I") - return NANOARROW_TYPE_UINT32; - else if (sv == "l") - return NANOARROW_TYPE_INT64; - else if (sv == "L") - return NANOARROW_TYPE_UINT64; - else if (sv == "f") - return NANOARROW_TYPE_FLOAT; - else if (sv == "g") - return NANOARROW_TYPE_DOUBLE; - else if (sv == "u") - return NANOARROW_TYPE_STRING; - else if (sv == "U") - return NANOARROW_TYPE_LARGE_STRING; - else if (sv == "b") - return NANOARROW_TYPE_BOOL; - else if (sv == "tss:") - return NANOARROW_TYPE_INT64; // NB time resolution set indepedently - else if (sv == "tsm:") - return NANOARROW_TYPE_INT64; // NB time resolution set indepedently - else if (sv == "tsn:") - return NANOARROW_TYPE_INT64; // NB time resolution set indepedently - else if (sv == "tsu:") - return NANOARROW_TYPE_INT64; // NB time resolution set indepedently - else if (sv == "tdD") - return NANOARROW_TYPE_INT32; // R Date: fractional days since epoch - else if (sv == "z") - return NANOARROW_TYPE_BINARY; - else if (sv == "Z") - return NANOARROW_TYPE_LARGE_BINARY; - else - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype string: {} ", sv)); + if (sv == "i") + return NANOARROW_TYPE_INT32; + else if (sv == "c") + return NANOARROW_TYPE_INT8; + else if (sv == "C") + return NANOARROW_TYPE_UINT8; + else if (sv == "s") + return NANOARROW_TYPE_INT16; + else if (sv == "S") + return NANOARROW_TYPE_UINT16; + else if (sv == "I") + return NANOARROW_TYPE_UINT32; + else if (sv == "l") + return NANOARROW_TYPE_INT64; + else if (sv == "L") + return NANOARROW_TYPE_UINT64; + else if (sv == "f") + return NANOARROW_TYPE_FLOAT; + else if (sv == "g") + return NANOARROW_TYPE_DOUBLE; + else if (sv == "u") + return NANOARROW_TYPE_STRING; + else if (sv == "U") + return NANOARROW_TYPE_LARGE_STRING; + else if (sv == "b") + return NANOARROW_TYPE_BOOL; + else if (sv == "tss:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsm:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsn:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tsu:") + return NANOARROW_TYPE_INT64; // NB time resolution set indepedently + else if (sv == "tdD") + return NANOARROW_TYPE_INT32; // R Date: fractional days since epoch + else if (sv == "z") + return NANOARROW_TYPE_BINARY; + else if (sv == "Z") + return NANOARROW_TYPE_LARGE_BINARY; + else + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB datatype string: {} ", sv)); } -} // namespace tiledbsoma +} // namespace tiledbsoma From 71d1a757701f658c876692f59d92bd74e4017b8a Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Wed, 27 Mar 2024 08:01:57 -0500 Subject: [PATCH 26/39] Heeding time to the lord of linting is time well spent some say --- libtiledbsoma/src/utils/nanoarrow.h | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/libtiledbsoma/src/utils/nanoarrow.h b/libtiledbsoma/src/utils/nanoarrow.h index 91c1e90708..db53e4bc94 100644 --- a/libtiledbsoma/src/utils/nanoarrow.h +++ b/libtiledbsoma/src/utils/nanoarrow.h @@ -2512,17 +2512,13 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { } static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { - *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | - ((values[2] + 0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | - ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | - ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); + *out = + (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); } static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { - *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | - ((values[2] + 0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | - ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | - ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); + *out = + (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); } static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { @@ -2634,10 +2630,10 @@ static inline void ArrowBitsSetTo( if (bytes_end == bytes_begin + 1) { // set bits within a single byte - const uint8_t only_byte_mask = i_end % 8 == 0 ? - first_byte_mask : - (uint8_t)(first_byte_mask | - last_byte_mask); + const uint8_t + only_byte_mask = i_end % 8 == 0 ? + first_byte_mask : + (uint8_t)(first_byte_mask | last_byte_mask); bits[bytes_begin] &= only_byte_mask; bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); return; @@ -2682,10 +2678,10 @@ static inline int64_t ArrowBitCountSet( const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; - const uint8_t only_byte_mask = i_end % 8 == 0 ? - last_byte_mask : - (uint8_t)(first_byte_mask & - last_byte_mask); + const uint8_t + only_byte_mask = i_end % 8 == 0 ? + last_byte_mask : + (uint8_t)(first_byte_mask & last_byte_mask); const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; return _ArrowkBytePopcount[byte_masked]; From 54e7953f4a7f37c633f2ad2d0587d128462906ba Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Wed, 27 Mar 2024 15:18:29 -0500 Subject: [PATCH 27/39] Correct another delete to free --- libtiledbsoma/src/utils/arrow_adapter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index a4ca011af6..9aa331a472 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -106,7 +106,7 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { delete arrow_buffer; if (array->buffers != nullptr) { - delete[] array->buffers; + free(array->buffers); array->buffers = nullptr; } From 06d61e10693dd536cd269449f175790901181891 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Thu, 28 Mar 2024 14:36:47 -0500 Subject: [PATCH 28/39] Additional non-nullptr protection --- libtiledbsoma/src/utils/arrow_adapter.cc | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 9aa331a472..34f652acbb 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -95,15 +95,17 @@ void ArrowAdapter::release_schema(struct ArrowSchema* schema) { void ArrowAdapter::release_array(struct ArrowArray* array) { auto arrow_buffer = static_cast(array->private_data); - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_array {} use_count={}", - arrow_buffer->buffer_->name(), - arrow_buffer->buffer_.use_count())); - - // Delete the ArrowBuffer, which was allocated with new. - // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the - // underlying ColumnBuffer, the ColumnBuffer will be deleted. - delete arrow_buffer; + if (arrow_buffer != nullptr) { + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_array {} use_count={}", + arrow_buffer->buffer_->name(), + arrow_buffer->buffer_.use_count())); + + // Delete the ArrowBuffer, which was allocated with new. + // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the + // underlying ColumnBuffer, the ColumnBuffer will be deleted. + delete arrow_buffer; + } if (array->buffers != nullptr) { free(array->buffers); From f64de8eb5ad1fcd7c5b012e8ee4ed7ebc0ec2ffd Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Mon, 1 Apr 2024 14:39:21 -0500 Subject: [PATCH 29/39] make format --- libtiledbsoma/src/utils/arrow_adapter.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 34f652acbb..89326c8a11 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -96,15 +96,15 @@ void ArrowAdapter::release_schema(struct ArrowSchema* schema) { void ArrowAdapter::release_array(struct ArrowArray* array) { auto arrow_buffer = static_cast(array->private_data); if (arrow_buffer != nullptr) { - LOG_TRACE(fmt::format( - "[ArrowAdapter] release_array {} use_count={}", - arrow_buffer->buffer_->name(), - arrow_buffer->buffer_.use_count())); - - // Delete the ArrowBuffer, which was allocated with new. - // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the - // underlying ColumnBuffer, the ColumnBuffer will be deleted. - delete arrow_buffer; + LOG_TRACE(fmt::format( + "[ArrowAdapter] release_array {} use_count={}", + arrow_buffer->buffer_->name(), + arrow_buffer->buffer_.use_count())); + + // Delete the ArrowBuffer, which was allocated with new. + // If the ArrowBuffer.buffer_ shared_ptr is the last reference to the + // underlying ColumnBuffer, the ColumnBuffer will be deleted. + delete arrow_buffer; } if (array->buffers != nullptr) { From d6efd8cc05aeab1a642b3edb88ab0ace35fe0cb8 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Mon, 1 Apr 2024 15:56:02 -0500 Subject: [PATCH 30/39] Additional test conditioner --- apis/r/tests/testthat/test-SOMADataFrame.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apis/r/tests/testthat/test-SOMADataFrame.R b/apis/r/tests/testthat/test-SOMADataFrame.R index 61b9ebace1..5984b7887d 100644 --- a/apis/r/tests/testthat/test-SOMADataFrame.R +++ b/apis/r/tests/testthat/test-SOMADataFrame.R @@ -783,7 +783,7 @@ test_that("missing levels in enums", { test_that("factor levels can grow without overlap", { - + skip_if(!extended_tests()) uri <- tempfile() schema <- arrow::schema(arrow::field(name = "soma_joinid", type = arrow::int64()), arrow::field(name = "obs_col_like", @@ -822,6 +822,7 @@ test_that("factor levels can grow without overlap", { }) test_that("factor levels cannot extend beyond index limit", { + skip_if(!extended_tests()) for (tp in c("INT8", "UINT8")) { uri <- tempfile() idx_type <- if (tp == "INT8") arrow::int8() else arrow::uint8() From ace4cc7cd6017ce600efe793772d4f25aa88ecbb Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 14:08:23 -0500 Subject: [PATCH 31/39] Correcting one buffer size selection --- libtiledbsoma/src/utils/arrow_adapter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 89326c8a11..ad3c22ab53 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -445,7 +445,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { exitIfError( ArrowArrayAllocateChildren(dict_arr, 0), "Bad array children alloc"); - const int n_buf = ArrowAdapter::_isstr(dict_sch->format) == 0 ? 3 : 2; + const int n_buf = ArrowAdapter::_isstr(dict_sch->format) == true ? 3 : 2; dict_arr->buffers = (const void**)malloc(sizeof(void*) * n_buf); dict_arr->buffers[0] = nullptr; // validity: none here dict_arr->release = &release_array; From 1c78d37e8a104f94a40e7c0af185010969657e6f Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 14:27:55 -0500 Subject: [PATCH 32/39] make format --- libtiledbsoma/src/utils/arrow_adapter.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index ad3c22ab53..7ecfc6dca8 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -445,7 +445,8 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { exitIfError( ArrowArrayAllocateChildren(dict_arr, 0), "Bad array children alloc"); - const int n_buf = ArrowAdapter::_isstr(dict_sch->format) == true ? 3 : 2; + const int n_buf = ArrowAdapter::_isstr(dict_sch->format) == true ? 3 : + 2; dict_arr->buffers = (const void**)malloc(sizeof(void*) * n_buf); dict_arr->buffers[0] = nullptr; // validity: none here dict_arr->release = &release_array; From c98a3963b6ded2d7ec668f23aabb430c6be6310b Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 15:47:57 -0500 Subject: [PATCH 33/39] Remove carrow.h and reference to it --- libtiledbsoma/src/CMakeLists.txt | 32 ++++++++-------- libtiledbsoma/src/cli/cli.cc | 1 - libtiledbsoma/src/utils/arrow_adapter.h | 3 -- libtiledbsoma/src/utils/carrow.h | 50 ------------------------- 4 files changed, 15 insertions(+), 71 deletions(-) delete mode 100644 libtiledbsoma/src/utils/carrow.h diff --git a/libtiledbsoma/src/CMakeLists.txt b/libtiledbsoma/src/CMakeLists.txt index 4aa44f4385..310549a806 100644 --- a/libtiledbsoma/src/CMakeLists.txt +++ b/libtiledbsoma/src/CMakeLists.txt @@ -174,7 +174,7 @@ if(SANITIZER) endif() endif() -# Install header files +# Install header files # target_sources FILE_SET is preferred with cmake>=3.23 # TODO Uncomment after finishing Python and R bindings # set(TILEDB_SOMA_PUBLIC_HEADERS @@ -189,21 +189,21 @@ endif() # ${CMAKE_CURRENT_SOURCE_DIR}/cpp_api/logger_public.h # ) -install(FILES +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/soma/enums.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/logger_public.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_context.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_context.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/managed_query.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/array_buffers.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/column_buffer.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_array.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/array_buffers.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/column_buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_array.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_group.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_collection.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dataframe.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_collection.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dataframe.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dense_ndarray.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_sparse_ndarray.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.h - ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_sparse_ndarray.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_object.h DESTINATION "include/tiledbsoma/soma" ) @@ -214,7 +214,6 @@ install(FILES ) install(FILES - ${CMAKE_CURRENT_SOURCE_DIR}/utils/carrow.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/nanoarrow.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/nanoarrow.hpp DESTINATION "include/tiledbsoma/utils/" @@ -226,11 +225,10 @@ install(FILES ) install(FILES - ${CMAKE_CURRENT_SOURCE_DIR}/utils/arrow_adapter.h - ${CMAKE_CURRENT_SOURCE_DIR}/utils/carrow.h - ${CMAKE_CURRENT_SOURCE_DIR}/utils/common.h - ${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.h - ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/arrow_adapter.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/common.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/util.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/version.h DESTINATION "include/tiledbsoma/utils" diff --git a/libtiledbsoma/src/cli/cli.cc b/libtiledbsoma/src/cli/cli.cc index cd69c8096f..4f698ed3a8 100644 --- a/libtiledbsoma/src/cli/cli.cc +++ b/libtiledbsoma/src/cli/cli.cc @@ -33,7 +33,6 @@ #include "soma/enums.h" #include "soma/soma_array.h" #include "utils/arrow_adapter.h" -// #include "utils/carrow.h" #include "utils/logger.h" using namespace tiledbsoma; diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 1d8ca8f6d4..6417be94c8 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -9,9 +9,6 @@ // https://arrow.apache.org/docs/format/CDataInterface.html#exporting-a-simple-int32-array #include "nanoarrow.hpp" -// #ifndef ARROW_SCHEMA_AND_ARRAY_DEFINED -// #include "carrow.h" -// #endif namespace tiledbsoma { diff --git a/libtiledbsoma/src/utils/carrow.h b/libtiledbsoma/src/utils/carrow.h deleted file mode 100644 index f97f3a7a08..0000000000 --- a/libtiledbsoma/src/utils/carrow.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef TILEDBSOMA_CARROW_H -#define TILEDBSOMA_CARROW_H -/* ************************************************************************ */ -/* - * Arrow C Data Interface - * Apache License 2.0 - * source: https://arrow.apache.org/docs/format/CDataInterface.html - */ - -#include - -#define ARROW_FLAG_DICTIONARY_ORDERED 1 -#define ARROW_FLAG_NULLABLE 2 -#define ARROW_FLAG_MAP_KEYS_SORTED 4 - -struct ArrowSchema { - // Array type description - const char* format; - const char* name; - const char* metadata; - int64_t flags; - int64_t n_children; - struct ArrowSchema** children; - struct ArrowSchema* dictionary; - - // Release callback - void (*release)(struct ArrowSchema*); - // Opaque producer-specific data - void* private_data; -}; - -struct ArrowArray { - // Array data description - int64_t length; - int64_t null_count; - int64_t offset; - int64_t n_buffers; - int64_t n_children; - const void** buffers; - struct ArrowArray** children; - struct ArrowArray* dictionary; - - // Release callback - void (*release)(struct ArrowArray*); - // Opaque producer-specific data - void* private_data; -}; -/* End Arrow C API */ -/* ************************************************************************ */ -#endif // TILEDBSOMA_CARROW_H From 445589f22e663396efec841e8e6a5e7262bab58c Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 15:52:16 -0500 Subject: [PATCH 34/39] Cleanups --- apis/r/src/riterator.cpp | 8 -------- apis/r/tests/testthat/test-SCEOutgest.R | 2 -- 2 files changed, 10 deletions(-) diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index d803a6c040..b9b65621bb 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -82,8 +82,6 @@ Rcpp::List sr_setup(const std::string& uri, Rcpp::Nullable timestamp_end = R_NilValue, const std::string& loglevel = "auto") { - //Rcpp::XPtr sr_setup(const std::string& uri, - if (loglevel != "auto") { spdl::set_level(loglevel); tdbs::LOG_SET_LEVEL(loglevel); @@ -232,10 +230,6 @@ nanoarrowXPtr sr_next(Rcpp::XPtr sr) { arr->length = 0; // initial value for (size_t i=0; i chldschemaxp = schema_owning_xptr(); - //Rcpp::XPtr chldarrayxp = array_owning_xptr(); - spdl::trace("[sr_next] Accessing {} at {}", names[i], i); // now buf is a shared_ptr to ColumnBuffer @@ -244,8 +238,6 @@ nanoarrowXPtr sr_next(Rcpp::XPtr sr) { // this is pair of array and schema pointer auto pp = tdbs::ArrowAdapter::to_arrow(buf); - //memcpy((void*) sch->children[i], pp.second.get(), sizeof(ArrowSchema)); - //memcpy((void*) arr->children[i], pp.first.get(), sizeof(ArrowArray)); ArrowArrayMove(pp.first.get(), arr->children[i]); ArrowSchemaMove(pp.second.get(), sch->children[i]); diff --git a/apis/r/tests/testthat/test-SCEOutgest.R b/apis/r/tests/testthat/test-SCEOutgest.R index 4dbd6838d3..24c44b880d 100644 --- a/apis/r/tests/testthat/test-SCEOutgest.R +++ b/apis/r/tests/testthat/test-SCEOutgest.R @@ -1,5 +1,4 @@ test_that("Load SCE object from ExperimentQuery mechanics", { - #if (Sys.getenv("GITHUB_ACTION") != "") set_log_level("trace") skip_if(!extended_tests() || covr_tests()) skip_if_not_installed('SingleCellExperiment', .MINIMUM_SCE_VERSION('c')) uri <- withr::local_tempdir("sce-experiment-query-whole") @@ -359,5 +358,4 @@ test_that("Load SCE object from indexed ExperimentQuery", { ) expect_identical(SingleCellExperiment::colPairNames(obj), 'connectivities') expect_identical(SingleCellExperiment::rowPairNames(obj), 'network') - #if (Sys.getenv("GITHUB_ACTION") != "") set_log_level("warn") }) From 4abee1bf7322271f74ffb18bf37f608ab72d1a61 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 16:02:55 -0500 Subject: [PATCH 35/39] Use nanoarrow.{c,hpp} via tiledbsoma/utils/ --- apis/r/inst/include/tiledbsoma_types.h | 2 +- apis/r/src/nanoarrow.c | 3351 --------------------- apis/r/src/nanoarrow.h | 3736 ------------------------ apis/r/src/nanoarrow.hpp | 501 ---- apis/r/src/rinterface.cpp | 8 +- apis/r/src/riterator.cpp | 6 +- apis/r/src/rutilities.cpp | 6 +- 7 files changed, 11 insertions(+), 7599 deletions(-) delete mode 100644 apis/r/src/nanoarrow.c delete mode 100644 apis/r/src/nanoarrow.h delete mode 100644 apis/r/src/nanoarrow.hpp diff --git a/apis/r/inst/include/tiledbsoma_types.h b/apis/r/inst/include/tiledbsoma_types.h index af27a1feb8..7bf0c44412 100644 --- a/apis/r/inst/include/tiledbsoma_types.h +++ b/apis/r/inst/include/tiledbsoma_types.h @@ -15,7 +15,7 @@ #define TILEDB_NO_API_DEPRECATION_WARNINGS #endif -#include // for C interface to Arrow +#include // for C interface to Arrow #include // for QueryCondition etc #define ARROW_SCHEMA_AND_ARRAY_DEFINED 1 #include diff --git a/apis/r/src/nanoarrow.c b/apis/r/src/nanoarrow.c deleted file mode 100644 index c946c01362..0000000000 --- a/apis/r/src/nanoarrow.c +++ /dev/null @@ -1,3351 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include -#include -#include - -#include "nanoarrow.h" - -const char* ArrowNanoarrowVersion(void) { return NANOARROW_VERSION; } - -int ArrowNanoarrowVersionInt(void) { return NANOARROW_VERSION_INT; } - -ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { - if (error == NULL) { - return NANOARROW_OK; - } - - memset(error->message, 0, sizeof(error->message)); - - va_list args; - va_start(args, fmt); - int chars_needed = vsnprintf(error->message, sizeof(error->message), fmt, args); - va_end(args); - - if (chars_needed < 0) { - return EINVAL; - } else if (((size_t)chars_needed) >= sizeof(error->message)) { - return ERANGE; - } else { - return NANOARROW_OK; - } -} - -void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { - layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_VALIDITY; - layout->buffer_data_type[0] = NANOARROW_TYPE_BOOL; - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; - layout->buffer_data_type[1] = storage_type; - layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_NONE; - layout->buffer_data_type[2] = NANOARROW_TYPE_UNINITIALIZED; - - layout->element_size_bits[0] = 1; - layout->element_size_bits[1] = 0; - layout->element_size_bits[2] = 0; - - layout->child_size_elements = 0; - - switch (storage_type) { - case NANOARROW_TYPE_UNINITIALIZED: - case NANOARROW_TYPE_NA: - layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; - layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED; - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; - layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; - layout->element_size_bits[0] = 0; - break; - - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_MAP: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; - layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; - layout->element_size_bits[1] = 32; - break; - - case NANOARROW_TYPE_LARGE_LIST: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; - layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; - layout->element_size_bits[1] = 64; - break; - - case NANOARROW_TYPE_STRUCT: - case NANOARROW_TYPE_FIXED_SIZE_LIST: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; - layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; - break; - - case NANOARROW_TYPE_BOOL: - layout->element_size_bits[1] = 1; - break; - - case NANOARROW_TYPE_UINT8: - case NANOARROW_TYPE_INT8: - layout->element_size_bits[1] = 8; - break; - - case NANOARROW_TYPE_UINT16: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_HALF_FLOAT: - layout->element_size_bits[1] = 16; - break; - - case NANOARROW_TYPE_UINT32: - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_FLOAT: - layout->element_size_bits[1] = 32; - break; - case NANOARROW_TYPE_INTERVAL_MONTHS: - layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; - layout->element_size_bits[1] = 32; - break; - - case NANOARROW_TYPE_UINT64: - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_DOUBLE: - case NANOARROW_TYPE_INTERVAL_DAY_TIME: - layout->element_size_bits[1] = 64; - break; - - case NANOARROW_TYPE_DECIMAL128: - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - layout->element_size_bits[1] = 128; - break; - - case NANOARROW_TYPE_DECIMAL256: - layout->element_size_bits[1] = 256; - break; - - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY; - break; - - case NANOARROW_TYPE_DENSE_UNION: - layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; - layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; - layout->element_size_bits[0] = 8; - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_UNION_OFFSET; - layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; - layout->element_size_bits[1] = 32; - break; - - case NANOARROW_TYPE_SPARSE_UNION: - layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; - layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; - layout->element_size_bits[0] = 8; - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; - layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; - break; - - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; - layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; - layout->element_size_bits[1] = 32; - layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; - layout->buffer_data_type[2] = storage_type; - break; - - case NANOARROW_TYPE_LARGE_STRING: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; - layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; - layout->element_size_bits[1] = 64; - layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; - layout->buffer_data_type[2] = NANOARROW_TYPE_STRING; - break; - case NANOARROW_TYPE_LARGE_BINARY: - layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; - layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; - layout->element_size_bits[1] = 64; - layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; - layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; - break; - - default: - break; - } -} - -void* ArrowMalloc(int64_t size) { return malloc(size); } - -void* ArrowRealloc(void* ptr, int64_t size) { return realloc(ptr, size); } - -void ArrowFree(void* ptr) { free(ptr); } - -static uint8_t* ArrowBufferAllocatorMallocReallocate( - struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, - int64_t new_size) { - NANOARROW_UNUSED(allocator); - NANOARROW_UNUSED(old_size); - return (uint8_t*)ArrowRealloc(ptr, new_size); -} - -static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocator, - uint8_t* ptr, int64_t size) { - NANOARROW_UNUSED(allocator); - NANOARROW_UNUSED(size); - ArrowFree(ptr); -} - -static struct ArrowBufferAllocator ArrowBufferAllocatorMalloc = { - &ArrowBufferAllocatorMallocReallocate, &ArrowBufferAllocatorMallocFree, NULL}; - -struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { - return ArrowBufferAllocatorMalloc; -} - -static uint8_t* ArrowBufferAllocatorNeverReallocate( - struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, - int64_t new_size) { - NANOARROW_UNUSED(allocator); - NANOARROW_UNUSED(ptr); - NANOARROW_UNUSED(old_size); - NANOARROW_UNUSED(new_size); - return NULL; -} - -struct ArrowBufferAllocator ArrowBufferDeallocator( - void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, - int64_t size), - void* private_data) { - struct ArrowBufferAllocator allocator; - allocator.reallocate = &ArrowBufferAllocatorNeverReallocate; - allocator.free = custom_free; - allocator.private_data = private_data; - return allocator; -} - -static const int kInt32DecimalDigits = 9; - -static const uint64_t kUInt32PowersOfTen[] = { - 1ULL, 10ULL, 100ULL, 1000ULL, 10000ULL, - 100000ULL, 1000000ULL, 10000000ULL, 100000000ULL, 1000000000ULL}; - -// Adapted from Arrow C++ to use 32-bit words for better C portability -// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L524-L544 -static void ShiftAndAdd(struct ArrowStringView value, uint32_t* out, int64_t out_size) { - // We use strtoll for parsing, which needs input that is null-terminated - char chunk_string[16]; - - for (int64_t posn = 0; posn < value.size_bytes;) { - int64_t remaining = value.size_bytes - posn; - - int64_t group_size; - if (remaining > kInt32DecimalDigits) { - group_size = kInt32DecimalDigits; - } else { - group_size = remaining; - } - - const uint64_t multiple = kUInt32PowersOfTen[group_size]; - - memcpy(chunk_string, value.data + posn, group_size); - chunk_string[group_size] = '\0'; - uint32_t chunk = (uint32_t)strtoll(chunk_string, NULL, 10); - - for (int64_t i = 0; i < out_size; i++) { - uint64_t tmp = out[i]; - tmp *= multiple; - tmp += chunk; - out[i] = (uint32_t)(tmp & 0xFFFFFFFFULL); - chunk = (uint32_t)(tmp >> 32); - } - posn += group_size; - } -} - -ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, - struct ArrowStringView value) { - // Check for sign - int is_negative = value.data[0] == '-'; - int has_sign = is_negative || value.data[0] == '+'; - value.data += has_sign; - value.size_bytes -= has_sign; - - // Check all characters are digits that are not the negative sign - for (int64_t i = 0; i < value.size_bytes; i++) { - char c = value.data[i]; - if (c < '0' || c > '9') { - return EINVAL; - } - } - - // Skip over leading 0s - int64_t n_leading_zeroes = 0; - for (int64_t i = 0; i < value.size_bytes; i++) { - if (value.data[i] == '0') { - n_leading_zeroes++; - } else { - break; - } - } - - value.data += n_leading_zeroes; - value.size_bytes -= n_leading_zeroes; - - // Use 32-bit words for portability - uint32_t words32[8]; - int n_words32 = decimal->n_words * 2; - NANOARROW_DCHECK(n_words32 <= 8); - memset(words32, 0, sizeof(words32)); - - ShiftAndAdd(value, words32, n_words32); - - if (decimal->low_word_index == 0) { - memcpy(decimal->words, words32, sizeof(uint32_t) * n_words32); - } else { - uint64_t lo; - uint64_t hi; - - for (int i = 0; i < decimal->n_words; i++) { - lo = (uint64_t)words32[i * 2]; - hi = (uint64_t)words32[i * 2 + 1] << 32; - decimal->words[decimal->n_words - i - 1] = lo | hi; - } - } - - if (is_negative) { - ArrowDecimalNegate(decimal); - } - - return NANOARROW_OK; -} - -// Adapted from Arrow C++ for C -// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L365 -ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, - struct ArrowBuffer* buffer) { - int is_negative = ArrowDecimalSign(decimal) < 0; - - uint64_t words_little_endian[4]; - if (decimal->low_word_index == 0) { - memcpy(words_little_endian, decimal->words, decimal->n_words * sizeof(uint64_t)); - } else { - for (int i = 0; i < decimal->n_words; i++) { - words_little_endian[i] = decimal->words[decimal->n_words - i - 1]; - } - } - - // We've already made a copy, so negate that if needed - if (is_negative) { - uint64_t carry = 1; - for (int i = 0; i < decimal->n_words; i++) { - uint64_t elem = words_little_endian[i]; - elem = ~elem + carry; - carry &= (elem == 0); - words_little_endian[i] = elem; - } - } - - // Find the most significant word that is non-zero - int most_significant_elem_idx = -1; - for (int i = decimal->n_words - 1; i >= 0; i--) { - if (words_little_endian[i] != 0) { - most_significant_elem_idx = i; - break; - } - } - - // If they are all zero, the output is just '0' - if (most_significant_elem_idx == -1) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(buffer, '0')); - return NANOARROW_OK; - } - - // Define segments such that each segment represents 9 digits with the - // least significant group of 9 digits first. For example, if the input represents - // 9876543210123456789, then segments will be [123456789, 876543210, 9]. - // We handle at most a signed 256 bit integer, whose maximum value occupies 77 - // characters. Thus, we need at most 9 segments. - const uint32_t k1e9 = 1000000000U; - int num_segments = 0; - uint32_t segments[9]; - memset(segments, 0, sizeof(segments)); - uint64_t* most_significant_elem = words_little_endian + most_significant_elem_idx; - - do { - // Compute remainder = words_little_endian % 1e9 and words_little_endian = - // words_little_endian / 1e9. - uint32_t remainder = 0; - uint64_t* elem = most_significant_elem; - - do { - // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer); - // *elem = dividend / 1e9; - // remainder = dividend % 1e9. - uint32_t hi = (uint32_t)(*elem >> 32); - uint32_t lo = (uint32_t)(*elem & 0xFFFFFFFFULL); - uint64_t dividend_hi = ((uint64_t)(remainder) << 32) | hi; - uint64_t quotient_hi = dividend_hi / k1e9; - remainder = (uint32_t)(dividend_hi % k1e9); - uint64_t dividend_lo = ((uint64_t)(remainder) << 32) | lo; - uint64_t quotient_lo = dividend_lo / k1e9; - remainder = (uint32_t)(dividend_lo % k1e9); - - *elem = (quotient_hi << 32) | quotient_lo; - } while (elem-- != words_little_endian); - - segments[num_segments++] = remainder; - } while (*most_significant_elem != 0 || most_significant_elem-- != words_little_endian); - - // We know our output has no more than 9 digits per segment, plus a negative sign, - // plus any further digits between our output of 9 digits plus enough - // extra characters to ensure that snprintf() with n = 21 (maximum length of %lu - // including a the null terminator) is bounded properly. - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, num_segments * 9 + 1 + 21 - 9)); - if (is_negative) { - buffer->data[buffer->size_bytes++] = '-'; - } - - // The most significant segment should have no leading zeroes - int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu", - (unsigned long)segments[num_segments - 1]); - buffer->size_bytes += n_chars; - - // Subsequent output needs to be left-padded with zeroes such that each segment - // takes up exactly 9 digits. - for (int i = num_segments - 2; i >= 0; i--) { - int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%09lu", - (unsigned long)segments[i]); - buffer->size_bytes += n_chars; - NANOARROW_DCHECK(buffer->size_bytes <= buffer->capacity_bytes); - } - - return NANOARROW_OK; -} -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include - -#include "nanoarrow.h" - -// -- changed for tiledb-r static -void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { - if (schema->format != NULL) ArrowFree((void*)schema->format); - if (schema->name != NULL) ArrowFree((void*)schema->name); - if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); - - // This object owns the memory for all the children, but those - // children may have been generated elsewhere and might have - // their own release() callback. - if (schema->children != NULL) { - for (int64_t i = 0; i < schema->n_children; i++) { - if (schema->children[i] != NULL) { - if (schema->children[i]->release != NULL) { - ArrowSchemaRelease(schema->children[i]); - } - - ArrowFree(schema->children[i]); - } - } - - ArrowFree(schema->children); - } - - // This object owns the memory for the dictionary but it - // may have been generated somewhere else and have its own - // release() callback. - if (schema->dictionary != NULL) { - if (schema->dictionary->release != NULL) { - ArrowSchemaRelease(schema->dictionary); - } - - ArrowFree(schema->dictionary); - } - - // private data not currently used - if (schema->private_data != NULL) { - ArrowFree(schema->private_data); - } - - schema->release = NULL; -} - -static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { - switch (type) { - case NANOARROW_TYPE_UNINITIALIZED: - return NULL; - case NANOARROW_TYPE_NA: - return "n"; - case NANOARROW_TYPE_BOOL: - return "b"; - - case NANOARROW_TYPE_UINT8: - return "C"; - case NANOARROW_TYPE_INT8: - return "c"; - case NANOARROW_TYPE_UINT16: - return "S"; - case NANOARROW_TYPE_INT16: - return "s"; - case NANOARROW_TYPE_UINT32: - return "I"; - case NANOARROW_TYPE_INT32: - return "i"; - case NANOARROW_TYPE_UINT64: - return "L"; - case NANOARROW_TYPE_INT64: - return "l"; - - case NANOARROW_TYPE_HALF_FLOAT: - return "e"; - case NANOARROW_TYPE_FLOAT: - return "f"; - case NANOARROW_TYPE_DOUBLE: - return "g"; - - case NANOARROW_TYPE_STRING: - return "u"; - case NANOARROW_TYPE_LARGE_STRING: - return "U"; - case NANOARROW_TYPE_BINARY: - return "z"; - case NANOARROW_TYPE_LARGE_BINARY: - return "Z"; - - case NANOARROW_TYPE_DATE32: - return "tdD"; - case NANOARROW_TYPE_DATE64: - return "tdm"; - case NANOARROW_TYPE_INTERVAL_MONTHS: - return "tiM"; - case NANOARROW_TYPE_INTERVAL_DAY_TIME: - return "tiD"; - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - return "tin"; - - case NANOARROW_TYPE_LIST: - return "+l"; - case NANOARROW_TYPE_LARGE_LIST: - return "+L"; - case NANOARROW_TYPE_STRUCT: - return "+s"; - case NANOARROW_TYPE_MAP: - return "+m"; - - default: - return NULL; - } -} - -static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, - enum ArrowType type) { - switch (type) { - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_LARGE_LIST: - case NANOARROW_TYPE_FIXED_SIZE_LIST: - NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); - ArrowSchemaInit(schema->children[0]); - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "item")); - break; - case NANOARROW_TYPE_MAP: - NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); - NANOARROW_RETURN_NOT_OK( - ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_STRUCT)); - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "entries")); - schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; - NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema->children[0], 2)); - ArrowSchemaInit(schema->children[0]->children[0]); - ArrowSchemaInit(schema->children[0]->children[1]); - NANOARROW_RETURN_NOT_OK( - ArrowSchemaSetName(schema->children[0]->children[0], "key")); - schema->children[0]->children[0]->flags &= ~ARROW_FLAG_NULLABLE; - NANOARROW_RETURN_NOT_OK( - ArrowSchemaSetName(schema->children[0]->children[1], "value")); - break; - default: - break; - } - - return NANOARROW_OK; -} - -void ArrowSchemaInit(struct ArrowSchema* schema) { - schema->format = NULL; - schema->name = NULL; - schema->metadata = NULL; - schema->flags = ARROW_FLAG_NULLABLE; - schema->n_children = 0; - schema->children = NULL; - schema->dictionary = NULL; - schema->private_data = NULL; - schema->release = &ArrowSchemaReleaseInternal; -} - -ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) { - // We don't allocate the dictionary because it has to be nullptr - // for non-dictionary-encoded arrays. - - // Set the format to a valid format string for type - const char* template_format = ArrowSchemaFormatTemplate(type); - - // If type isn't recognized and not explicitly unset - if (template_format == NULL && type != NANOARROW_TYPE_UNINITIALIZED) { - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, template_format)); - - // For types with an umabiguous child structure, allocate children - return ArrowSchemaInitChildrenIfNeeded(schema, type); -} - -ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children) { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRUCT)); - NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); - for (int64_t i = 0; i < n_children; i++) { - ArrowSchemaInit(schema->children[i]); - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type) { - ArrowSchemaInit(schema); - - int result = ArrowSchemaSetType(schema, type); - if (result != NANOARROW_OK) { - ArrowSchemaRelease(schema); - return result; - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, - enum ArrowType type, int32_t fixed_size) { - if (fixed_size <= 0) { - return EINVAL; - } - - char buffer[64]; - int n_chars; - switch (type) { - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - n_chars = snprintf(buffer, sizeof(buffer), "w:%d", (int)fixed_size); - break; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - n_chars = snprintf(buffer, sizeof(buffer), "+w:%d", (int)fixed_size); - break; - default: - return EINVAL; - } - - buffer[n_chars] = '\0'; - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer)); - - if (type == NANOARROW_TYPE_FIXED_SIZE_LIST) { - NANOARROW_RETURN_NOT_OK(ArrowSchemaInitChildrenIfNeeded(schema, type)); - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, - int32_t decimal_precision, - int32_t decimal_scale) { - if (decimal_precision <= 0) { - return EINVAL; - } - - char buffer[64]; - int n_chars; - switch (type) { - case NANOARROW_TYPE_DECIMAL128: - n_chars = - snprintf(buffer, sizeof(buffer), "d:%d,%d", decimal_precision, decimal_scale); - break; - case NANOARROW_TYPE_DECIMAL256: - n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,256", decimal_precision, - decimal_scale); - break; - default: - return EINVAL; - } - - buffer[n_chars] = '\0'; - return ArrowSchemaSetFormat(schema, buffer); -} - -static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) { - switch (time_unit) { - case NANOARROW_TIME_UNIT_SECOND: - return "s"; - case NANOARROW_TIME_UNIT_MILLI: - return "m"; - case NANOARROW_TIME_UNIT_MICRO: - return "u"; - case NANOARROW_TIME_UNIT_NANO: - return "n"; - default: - return NULL; - } -} - -ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, - enum ArrowTimeUnit time_unit, - const char* timezone) { - const char* time_unit_str = ArrowTimeUnitFormatString(time_unit); - if (time_unit_str == NULL) { - return EINVAL; - } - - char buffer[128]; - int n_chars; - switch (type) { - case NANOARROW_TYPE_TIME32: - if (timezone != NULL) { - return EINVAL; - } - - switch (time_unit) { - case NANOARROW_TIME_UNIT_MICRO: - case NANOARROW_TIME_UNIT_NANO: - return EINVAL; - default: - break; - } - - n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); - break; - case NANOARROW_TYPE_TIME64: - if (timezone != NULL) { - return EINVAL; - } - - switch (time_unit) { - case NANOARROW_TIME_UNIT_SECOND: - case NANOARROW_TIME_UNIT_MILLI: - return EINVAL; - default: - break; - } - - n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); - break; - case NANOARROW_TYPE_TIMESTAMP: - if (timezone == NULL) { - timezone = ""; - } - n_chars = snprintf(buffer, sizeof(buffer), "ts%s:%s", time_unit_str, timezone); - break; - case NANOARROW_TYPE_DURATION: - if (timezone != NULL) { - return EINVAL; - } - n_chars = snprintf(buffer, sizeof(buffer), "tD%s", time_unit_str); - break; - default: - return EINVAL; - } - - if (((size_t)n_chars) >= sizeof(buffer)) { - return ERANGE; - } - - buffer[n_chars] = '\0'; - - return ArrowSchemaSetFormat(schema, buffer); -} - -ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, - int64_t n_children) { - if (n_children < 0 || n_children > 127) { - return EINVAL; - } - - // Max valid size would be +ud:0,1,...126 = 401 characters + null terminator - char format_out[512]; - int64_t format_out_size = 512; - memset(format_out, 0, format_out_size); - int n_chars; - char* format_cursor = format_out; - - switch (type) { - case NANOARROW_TYPE_SPARSE_UNION: - n_chars = snprintf(format_cursor, format_out_size, "+us:"); - format_cursor += n_chars; - format_out_size -= n_chars; - break; - case NANOARROW_TYPE_DENSE_UNION: - n_chars = snprintf(format_cursor, format_out_size, "+ud:"); - format_cursor += n_chars; - format_out_size -= n_chars; - break; - default: - return EINVAL; - } - - if (n_children > 0) { - n_chars = snprintf(format_cursor, format_out_size, "0"); - format_cursor += n_chars; - format_out_size -= n_chars; - - for (int64_t i = 1; i < n_children; i++) { - n_chars = snprintf(format_cursor, format_out_size, ",%d", (int)i); - format_cursor += n_chars; - format_out_size -= n_chars; - } - } - - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out)); - - NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); - for (int64_t i = 0; i < n_children; i++) { - ArrowSchemaInit(schema->children[i]); - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format) { - if (schema->format != NULL) { - ArrowFree((void*)schema->format); - } - - if (format != NULL) { - size_t format_size = strlen(format) + 1; - schema->format = (const char*)ArrowMalloc(format_size); - if (schema->format == NULL) { - return ENOMEM; - } - - memcpy((void*)schema->format, format, format_size); - } else { - schema->format = NULL; - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name) { - if (schema->name != NULL) { - ArrowFree((void*)schema->name); - } - - if (name != NULL) { - size_t name_size = strlen(name) + 1; - schema->name = (const char*)ArrowMalloc(name_size); - if (schema->name == NULL) { - return ENOMEM; - } - - memcpy((void*)schema->name, name, name_size); - } else { - schema->name = NULL; - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata) { - if (schema->metadata != NULL) { - ArrowFree((void*)schema->metadata); - } - - if (metadata != NULL) { - size_t metadata_size = ArrowMetadataSizeOf(metadata); - schema->metadata = (const char*)ArrowMalloc(metadata_size); - if (schema->metadata == NULL) { - return ENOMEM; - } - - memcpy((void*)schema->metadata, metadata, metadata_size); - } else { - schema->metadata = NULL; - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, - int64_t n_children) { - if (schema->children != NULL) { - return EEXIST; - } - - if (n_children > 0) { - schema->children = - (struct ArrowSchema**)ArrowMalloc(n_children * sizeof(struct ArrowSchema*)); - - if (schema->children == NULL) { - return ENOMEM; - } - - schema->n_children = n_children; - - memset(schema->children, 0, n_children * sizeof(struct ArrowSchema*)); - - for (int64_t i = 0; i < n_children; i++) { - schema->children[i] = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); - - if (schema->children[i] == NULL) { - return ENOMEM; - } - - schema->children[i]->release = NULL; - } - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) { - if (schema->dictionary != NULL) { - return EEXIST; - } - - schema->dictionary = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); - if (schema->dictionary == NULL) { - return ENOMEM; - } - - schema->dictionary->release = NULL; - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, - struct ArrowSchema* schema_out) { - ArrowSchemaInit(schema_out); - - int result = ArrowSchemaSetFormat(schema_out, schema->format); - if (result != NANOARROW_OK) { - ArrowSchemaRelease(schema_out); - return result; - } - - schema_out->flags = schema->flags; - - result = ArrowSchemaSetName(schema_out, schema->name); - if (result != NANOARROW_OK) { - ArrowSchemaRelease(schema_out); - return result; - } - - result = ArrowSchemaSetMetadata(schema_out, schema->metadata); - if (result != NANOARROW_OK) { - ArrowSchemaRelease(schema_out); - return result; - } - - result = ArrowSchemaAllocateChildren(schema_out, schema->n_children); - if (result != NANOARROW_OK) { - ArrowSchemaRelease(schema_out); - return result; - } - - for (int64_t i = 0; i < schema->n_children; i++) { - result = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]); - if (result != NANOARROW_OK) { - ArrowSchemaRelease(schema_out); - return result; - } - } - - if (schema->dictionary != NULL) { - result = ArrowSchemaAllocateDictionary(schema_out); - if (result != NANOARROW_OK) { - ArrowSchemaRelease(schema_out); - return result; - } - - result = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary); - if (result != NANOARROW_OK) { - ArrowSchemaRelease(schema_out); - return result; - } - } - - return NANOARROW_OK; -} - -static void ArrowSchemaViewSetPrimitive(struct ArrowSchemaView* schema_view, - enum ArrowType type) { - schema_view->type = type; - schema_view->storage_type = type; -} - -static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, - const char* format, - const char** format_end_out, - struct ArrowError* error) { - *format_end_out = format; - - // needed for decimal parsing - const char* parse_start; - char* parse_end; - - switch (format[0]) { - case 'n': - schema_view->type = NANOARROW_TYPE_NA; - schema_view->storage_type = NANOARROW_TYPE_NA; - *format_end_out = format + 1; - return NANOARROW_OK; - case 'b': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BOOL); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'c': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT8); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'C': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT8); - *format_end_out = format + 1; - return NANOARROW_OK; - case 's': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT16); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'S': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT16); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'i': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'I': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT32); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'l': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'L': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT64); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'e': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_HALF_FLOAT); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'f': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_FLOAT); - *format_end_out = format + 1; - return NANOARROW_OK; - case 'g': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DOUBLE); - *format_end_out = format + 1; - return NANOARROW_OK; - - // decimal - case 'd': - if (format[1] != ':' || format[2] == '\0') { - ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'"); - return EINVAL; - } - - parse_start = format + 2; - schema_view->decimal_precision = (int32_t)strtol(parse_start, &parse_end, 10); - if (parse_end == parse_start || parse_end[0] != ',') { - ArrowErrorSet(error, "Expected 'precision,scale[,bitwidth]' following 'd:'"); - return EINVAL; - } - - parse_start = parse_end + 1; - schema_view->decimal_scale = (int32_t)strtol(parse_start, &parse_end, 10); - if (parse_end == parse_start) { - ArrowErrorSet(error, "Expected 'scale[,bitwidth]' following 'd:precision,'"); - return EINVAL; - } else if (parse_end[0] != ',') { - schema_view->decimal_bitwidth = 128; - } else { - parse_start = parse_end + 1; - schema_view->decimal_bitwidth = (int32_t)strtol(parse_start, &parse_end, 10); - if (parse_start == parse_end) { - ArrowErrorSet(error, "Expected precision following 'd:precision,scale,'"); - return EINVAL; - } - } - - *format_end_out = parse_end; - - switch (schema_view->decimal_bitwidth) { - case 128: - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL128); - return NANOARROW_OK; - case 256: - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL256); - return NANOARROW_OK; - default: - ArrowErrorSet(error, "Expected decimal bitwidth of 128 or 256 but found %d", - (int)schema_view->decimal_bitwidth); - return EINVAL; - } - - // validity + data - case 'w': - schema_view->type = NANOARROW_TYPE_FIXED_SIZE_BINARY; - schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_BINARY; - if (format[1] != ':' || format[2] == '\0') { - ArrowErrorSet(error, "Expected ':' following 'w'"); - return EINVAL; - } - - schema_view->fixed_size = (int32_t)strtol(format + 2, (char**)format_end_out, 10); - return NANOARROW_OK; - - // validity + offset + data - case 'z': - schema_view->type = NANOARROW_TYPE_BINARY; - schema_view->storage_type = NANOARROW_TYPE_BINARY; - *format_end_out = format + 1; - return NANOARROW_OK; - case 'u': - schema_view->type = NANOARROW_TYPE_STRING; - schema_view->storage_type = NANOARROW_TYPE_STRING; - *format_end_out = format + 1; - return NANOARROW_OK; - - // validity + large_offset + data - case 'Z': - schema_view->type = NANOARROW_TYPE_LARGE_BINARY; - schema_view->storage_type = NANOARROW_TYPE_LARGE_BINARY; - *format_end_out = format + 1; - return NANOARROW_OK; - case 'U': - schema_view->type = NANOARROW_TYPE_LARGE_STRING; - schema_view->storage_type = NANOARROW_TYPE_LARGE_STRING; - *format_end_out = format + 1; - return NANOARROW_OK; - - // nested types - case '+': - switch (format[1]) { - // list has validity + offset or offset - case 'l': - schema_view->storage_type = NANOARROW_TYPE_LIST; - schema_view->type = NANOARROW_TYPE_LIST; - *format_end_out = format + 2; - return NANOARROW_OK; - - // large list has validity + large_offset or large_offset - case 'L': - schema_view->storage_type = NANOARROW_TYPE_LARGE_LIST; - schema_view->type = NANOARROW_TYPE_LARGE_LIST; - *format_end_out = format + 2; - return NANOARROW_OK; - - // just validity buffer - case 'w': - if (format[2] != ':' || format[3] == '\0') { - ArrowErrorSet(error, "Expected ':' following '+w'"); - return EINVAL; - } - - schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_LIST; - schema_view->type = NANOARROW_TYPE_FIXED_SIZE_LIST; - schema_view->fixed_size = - (int32_t)strtol(format + 3, (char**)format_end_out, 10); - return NANOARROW_OK; - case 's': - schema_view->storage_type = NANOARROW_TYPE_STRUCT; - schema_view->type = NANOARROW_TYPE_STRUCT; - *format_end_out = format + 2; - return NANOARROW_OK; - case 'm': - schema_view->storage_type = NANOARROW_TYPE_MAP; - schema_view->type = NANOARROW_TYPE_MAP; - *format_end_out = format + 2; - return NANOARROW_OK; - - // unions - case 'u': - switch (format[2]) { - case 'd': - schema_view->storage_type = NANOARROW_TYPE_DENSE_UNION; - schema_view->type = NANOARROW_TYPE_DENSE_UNION; - break; - case 's': - schema_view->storage_type = NANOARROW_TYPE_SPARSE_UNION; - schema_view->type = NANOARROW_TYPE_SPARSE_UNION; - break; - default: - ArrowErrorSet(error, - "Expected union format string +us: or " - "+ud: but found '%s'", - format); - return EINVAL; - } - - if (format[3] == ':') { - schema_view->union_type_ids = format + 4; - int64_t n_type_ids = - _ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL); - if (n_type_ids != schema_view->schema->n_children) { - ArrowErrorSet( - error, - "Expected union type_ids parameter to be a comma-separated list of %ld " - "values between 0 and 127 but found '%s'", - (long)schema_view->schema->n_children, schema_view->union_type_ids); - return EINVAL; - } - *format_end_out = format + strlen(format); - return NANOARROW_OK; - } else { - ArrowErrorSet(error, - "Expected union format string +us: or +ud: " - "but found '%s'", - format); - return EINVAL; - } - - default: - ArrowErrorSet(error, "Expected nested type format string but found '%s'", - format); - return EINVAL; - } - - // date/time types - case 't': - switch (format[1]) { - // date - case 'd': - switch (format[2]) { - case 'D': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); - schema_view->type = NANOARROW_TYPE_DATE32; - *format_end_out = format + 3; - return NANOARROW_OK; - case 'm': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_DATE64; - *format_end_out = format + 3; - return NANOARROW_OK; - default: - ArrowErrorSet(error, "Expected 'D' or 'm' following 'td' but found '%s'", - format + 2); - return EINVAL; - } - - // time of day - case 't': - switch (format[2]) { - case 's': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); - schema_view->type = NANOARROW_TYPE_TIME32; - schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; - *format_end_out = format + 3; - return NANOARROW_OK; - case 'm': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); - schema_view->type = NANOARROW_TYPE_TIME32; - schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; - *format_end_out = format + 3; - return NANOARROW_OK; - case 'u': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_TIME64; - schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; - *format_end_out = format + 3; - return NANOARROW_OK; - case 'n': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_TIME64; - schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; - *format_end_out = format + 3; - return NANOARROW_OK; - default: - ArrowErrorSet( - error, "Expected 's', 'm', 'u', or 'n' following 'tt' but found '%s'", - format + 2); - return EINVAL; - } - - // timestamp - case 's': - switch (format[2]) { - case 's': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_TIMESTAMP; - schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; - break; - case 'm': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_TIMESTAMP; - schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; - break; - case 'u': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_TIMESTAMP; - schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; - break; - case 'n': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_TIMESTAMP; - schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; - break; - default: - ArrowErrorSet( - error, "Expected 's', 'm', 'u', or 'n' following 'ts' but found '%s'", - format + 2); - return EINVAL; - } - - if (format[3] != ':') { - ArrowErrorSet(error, "Expected ':' following '%.3s' but found '%s'", format, - format + 3); - return EINVAL; - } - - schema_view->timezone = format + 4; - *format_end_out = format + strlen(format); - return NANOARROW_OK; - - // duration - case 'D': - switch (format[2]) { - case 's': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_DURATION; - schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; - *format_end_out = format + 3; - return NANOARROW_OK; - case 'm': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_DURATION; - schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; - *format_end_out = format + 3; - return NANOARROW_OK; - case 'u': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_DURATION; - schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; - *format_end_out = format + 3; - return NANOARROW_OK; - case 'n': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); - schema_view->type = NANOARROW_TYPE_DURATION; - schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; - *format_end_out = format + 3; - return NANOARROW_OK; - default: - ArrowErrorSet(error, - "Expected 's', 'm', u', or 'n' following 'tD' but found '%s'", - format + 2); - return EINVAL; - } - - // interval - case 'i': - switch (format[2]) { - case 'M': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTHS); - *format_end_out = format + 3; - return NANOARROW_OK; - case 'D': - ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_DAY_TIME); - *format_end_out = format + 3; - return NANOARROW_OK; - case 'n': - ArrowSchemaViewSetPrimitive(schema_view, - NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO); - *format_end_out = format + 3; - return NANOARROW_OK; - default: - ArrowErrorSet(error, - "Expected 'M', 'D', or 'n' following 'ti' but found '%s'", - format + 2); - return EINVAL; - } - - default: - ArrowErrorSet( - error, "Expected 'd', 't', 's', 'D', or 'i' following 't' but found '%s'", - format + 1); - return EINVAL; - } - - default: - ArrowErrorSet(error, "Unknown format: '%s'", format); - return EINVAL; - } -} - -static ArrowErrorCode ArrowSchemaViewValidateNChildren( - struct ArrowSchemaView* schema_view, int64_t n_children, struct ArrowError* error) { - if (n_children != -1 && schema_view->schema->n_children != n_children) { - ArrowErrorSet(error, "Expected schema with %d children but found %d children", - (int)n_children, (int)schema_view->schema->n_children); - return EINVAL; - } - - // Don't do a full validation of children but do check that they won't - // segfault if inspected - struct ArrowSchema* child; - for (int64_t i = 0; i < schema_view->schema->n_children; i++) { - child = schema_view->schema->children[i]; - if (child == NULL) { - ArrowErrorSet(error, - "Expected valid schema at schema->children[%ld] but found NULL", - (long)i); - return EINVAL; - } else if (child->release == NULL) { - ArrowErrorSet( - error, - "Expected valid schema at schema->children[%ld] but found a released schema", - (long)i); - return EINVAL; - } - } - - return NANOARROW_OK; -} - -static ArrowErrorCode ArrowSchemaViewValidateUnion(struct ArrowSchemaView* schema_view, - struct ArrowError* error) { - return ArrowSchemaViewValidateNChildren(schema_view, -1, error); -} - -static ArrowErrorCode ArrowSchemaViewValidateMap(struct ArrowSchemaView* schema_view, - struct ArrowError* error) { - NANOARROW_RETURN_NOT_OK(ArrowSchemaViewValidateNChildren(schema_view, 1, error)); - - if (schema_view->schema->children[0]->n_children != 2) { - ArrowErrorSet(error, "Expected child of map type to have 2 children but found %d", - (int)schema_view->schema->children[0]->n_children); - return EINVAL; - } - - if (strcmp(schema_view->schema->children[0]->format, "+s") != 0) { - ArrowErrorSet(error, "Expected format of child of map type to be '+s' but found '%s'", - schema_view->schema->children[0]->format); - return EINVAL; - } - - if (schema_view->schema->children[0]->flags & ARROW_FLAG_NULLABLE) { - ArrowErrorSet(error, - "Expected child of map type to be non-nullable but was nullable"); - return EINVAL; - } - - if (schema_view->schema->children[0]->children[0]->flags & ARROW_FLAG_NULLABLE) { - ArrowErrorSet(error, "Expected key of map type to be non-nullable but was nullable"); - return EINVAL; - } - - return NANOARROW_OK; -} - -static ArrowErrorCode ArrowSchemaViewValidateDictionary( - struct ArrowSchemaView* schema_view, struct ArrowError* error) { - // check for valid index type - switch (schema_view->storage_type) { - case NANOARROW_TYPE_UINT8: - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT16: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT32: - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT64: - case NANOARROW_TYPE_INT64: - break; - default: - ArrowErrorSet( - error, - "Expected dictionary schema index type to be an integral type but found '%s'", - schema_view->schema->format); - return EINVAL; - } - - struct ArrowSchemaView dictionary_schema_view; - return ArrowSchemaViewInit(&dictionary_schema_view, schema_view->schema->dictionary, - error); -} - -static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_view, - enum ArrowType type, - struct ArrowError* error) { - switch (type) { - case NANOARROW_TYPE_NA: - case NANOARROW_TYPE_BOOL: - case NANOARROW_TYPE_UINT8: - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT16: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT32: - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT64: - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_HALF_FLOAT: - case NANOARROW_TYPE_FLOAT: - case NANOARROW_TYPE_DOUBLE: - case NANOARROW_TYPE_DECIMAL128: - case NANOARROW_TYPE_DECIMAL256: - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_BINARY: - case NANOARROW_TYPE_LARGE_BINARY: - case NANOARROW_TYPE_DATE32: - case NANOARROW_TYPE_DATE64: - case NANOARROW_TYPE_INTERVAL_MONTHS: - case NANOARROW_TYPE_INTERVAL_DAY_TIME: - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - case NANOARROW_TYPE_TIMESTAMP: - case NANOARROW_TYPE_TIME32: - case NANOARROW_TYPE_TIME64: - case NANOARROW_TYPE_DURATION: - return ArrowSchemaViewValidateNChildren(schema_view, 0, error); - - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - if (schema_view->fixed_size <= 0) { - ArrowErrorSet(error, "Expected size > 0 for fixed size binary but found size %d", - schema_view->fixed_size); - return EINVAL; - } - return ArrowSchemaViewValidateNChildren(schema_view, 0, error); - - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_LARGE_LIST: - case NANOARROW_TYPE_FIXED_SIZE_LIST: - return ArrowSchemaViewValidateNChildren(schema_view, 1, error); - - case NANOARROW_TYPE_STRUCT: - return ArrowSchemaViewValidateNChildren(schema_view, -1, error); - - case NANOARROW_TYPE_SPARSE_UNION: - case NANOARROW_TYPE_DENSE_UNION: - return ArrowSchemaViewValidateUnion(schema_view, error); - - case NANOARROW_TYPE_MAP: - return ArrowSchemaViewValidateMap(schema_view, error); - - case NANOARROW_TYPE_DICTIONARY: - return ArrowSchemaViewValidateDictionary(schema_view, error); - - default: - ArrowErrorSet(error, "Expected a valid enum ArrowType value but found %d", - (int)schema_view->type); - return EINVAL; - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, - const struct ArrowSchema* schema, - struct ArrowError* error) { - if (schema == NULL) { - ArrowErrorSet(error, "Expected non-NULL schema"); - return EINVAL; - } - - if (schema->release == NULL) { - ArrowErrorSet(error, "Expected non-released schema"); - return EINVAL; - } - - schema_view->schema = schema; - - const char* format = schema->format; - if (format == NULL) { - ArrowErrorSet( - error, - "Error parsing schema->format: Expected a null-terminated string but found NULL"); - return EINVAL; - } - - size_t format_len = strlen(format); - if (format_len == 0) { - ArrowErrorSet(error, "Error parsing schema->format: Expected a string with size > 0"); - return EINVAL; - } - - const char* format_end_out; - int result = ArrowSchemaViewParse(schema_view, format, &format_end_out, error); - - if (result != NANOARROW_OK) { - if (error != NULL) { - char child_error[1024]; - memcpy(child_error, ArrowErrorMessage(error), 1024); - ArrowErrorSet(error, "Error parsing schema->format: %s", child_error); - } - - return result; - } - - if ((format + format_len) != format_end_out) { - ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%d characters", - format, (int)(format_end_out - format), (int)(format_len)); - return EINVAL; - } - - if (schema->dictionary != NULL) { - schema_view->type = NANOARROW_TYPE_DICTIONARY; - } - - NANOARROW_RETURN_NOT_OK( - ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error)); - - if (schema_view->storage_type != schema_view->type) { - NANOARROW_RETURN_NOT_OK( - ArrowSchemaViewValidate(schema_view, schema_view->type, error)); - } - - int64_t unknown_flags = schema->flags & ~NANOARROW_FLAG_ALL_SUPPORTED; - if (unknown_flags != 0) { - ArrowErrorSet(error, "Unknown ArrowSchema flag"); - return EINVAL; - } - - if (schema->flags & ARROW_FLAG_DICTIONARY_ORDERED && - schema_view->type != NANOARROW_TYPE_DICTIONARY) { - ArrowErrorSet(error, - "ARROW_FLAG_DICTIONARY_ORDERED is only relevant for dictionaries"); - return EINVAL; - } - - if (schema->flags & ARROW_FLAG_MAP_KEYS_SORTED && - schema_view->type != NANOARROW_TYPE_MAP) { - ArrowErrorSet(error, "ARROW_FLAG_MAP_KEYS_SORTED is only relevant for a map type"); - return EINVAL; - } - - ArrowLayoutInit(&schema_view->layout, schema_view->storage_type); - if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_BINARY) { - schema_view->layout.element_size_bits[1] = schema_view->fixed_size * 8; - } else if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_LIST) { - schema_view->layout.child_size_elements = schema_view->fixed_size; - } - - schema_view->extension_name = ArrowCharView(NULL); - schema_view->extension_metadata = ArrowCharView(NULL); - NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, - ArrowCharView("ARROW:extension:name"), - &schema_view->extension_name)); - NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, - ArrowCharView("ARROW:extension:metadata"), - &schema_view->extension_metadata)); - - return NANOARROW_OK; -} - -static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_view, - char* out, int64_t n) { - const char* type_string = ArrowTypeString(schema_view->type); - switch (schema_view->type) { - case NANOARROW_TYPE_DECIMAL128: - case NANOARROW_TYPE_DECIMAL256: - return snprintf(out, n, "%s(%d, %d)", type_string, - (int)schema_view->decimal_precision, - (int)schema_view->decimal_scale); - case NANOARROW_TYPE_TIMESTAMP: - return snprintf(out, n, "%s('%s', '%s')", type_string, - ArrowTimeUnitString(schema_view->time_unit), schema_view->timezone); - case NANOARROW_TYPE_TIME32: - case NANOARROW_TYPE_TIME64: - case NANOARROW_TYPE_DURATION: - return snprintf(out, n, "%s('%s')", type_string, - ArrowTimeUnitString(schema_view->time_unit)); - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - case NANOARROW_TYPE_FIXED_SIZE_LIST: - return snprintf(out, n, "%s(%ld)", type_string, (long)schema_view->fixed_size); - case NANOARROW_TYPE_SPARSE_UNION: - case NANOARROW_TYPE_DENSE_UNION: - return snprintf(out, n, "%s([%s])", type_string, schema_view->union_type_ids); - default: - return snprintf(out, n, "%s", type_string); - } -} - -// Helper for bookkeeping to emulate sprintf()-like behaviour spread -// among multiple sprintf calls. -static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, - int64_t* n_remaining, int64_t* n_chars) { - *n_chars += n_chars_last; - *n_remaining -= n_chars_last; - - // n_remaining is never less than 0 - if (*n_remaining < 0) { - *n_remaining = 0; - } - - // Can't do math on a NULL pointer - if (*out != NULL) { - *out += n_chars_last; - } -} - -int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, - char recursive) { - if (schema == NULL) { - return snprintf(out, n, "[invalid: pointer is null]"); - } - - if (schema->release == NULL) { - return snprintf(out, n, "[invalid: schema is released]"); - } - - struct ArrowSchemaView schema_view; - struct ArrowError error; - - if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) { - return snprintf(out, n, "[invalid: %s]", ArrowErrorMessage(&error)); - } - - // Extension type and dictionary should include both the top-level type - // and the storage type. - int is_extension = schema_view.extension_name.size_bytes > 0; - int is_dictionary = schema->dictionary != NULL; - int64_t n_chars = 0; - int64_t n_chars_last = 0; - - // Uncommon but not technically impossible that both are true - if (is_extension && is_dictionary) { - n_chars_last = snprintf( - out, n, "%.*s{dictionary(%s)<", (int)schema_view.extension_name.size_bytes, - schema_view.extension_name.data, ArrowTypeString(schema_view.storage_type)); - } else if (is_extension) { - n_chars_last = snprintf(out, n, "%.*s{", (int)schema_view.extension_name.size_bytes, - schema_view.extension_name.data); - } else if (is_dictionary) { - n_chars_last = - snprintf(out, n, "dictionary(%s)<", ArrowTypeString(schema_view.storage_type)); - } - - ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); - - if (!is_dictionary) { - n_chars_last = ArrowSchemaTypeToStringInternal(&schema_view, out, n); - } else { - n_chars_last = ArrowSchemaToString(schema->dictionary, out, n, recursive); - } - - ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); - - if (recursive && schema->format[0] == '+') { - n_chars_last = snprintf(out, n, "<"); - ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); - - for (int64_t i = 0; i < schema->n_children; i++) { - if (i > 0) { - n_chars_last = snprintf(out, n, ", "); - ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); - } - - // ArrowSchemaToStringInternal() will validate the child and print the error, - // but we need the name first - if (schema->children[i] != NULL && schema->children[i]->release != NULL && - schema->children[i]->name != NULL) { - n_chars_last = snprintf(out, n, "%s: ", schema->children[i]->name); - ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); - } - - n_chars_last = ArrowSchemaToString(schema->children[i], out, n, recursive); - ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); - } - - n_chars_last = snprintf(out, n, ">"); - ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); - } - - if (is_extension && is_dictionary) { - n_chars += snprintf(out, n, ">}"); - } else if (is_extension) { - n_chars += snprintf(out, n, "}"); - } else if (is_dictionary) { - n_chars += snprintf(out, n, ">"); - } - - return n_chars; -} - -ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, - const char* metadata) { - reader->metadata = metadata; - - if (reader->metadata == NULL) { - reader->offset = 0; - reader->remaining_keys = 0; - } else { - memcpy(&reader->remaining_keys, reader->metadata, sizeof(int32_t)); - reader->offset = sizeof(int32_t); - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, - struct ArrowStringView* key_out, - struct ArrowStringView* value_out) { - if (reader->remaining_keys <= 0) { - return EINVAL; - } - - int64_t pos = 0; - - int32_t key_size; - memcpy(&key_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); - pos += sizeof(int32_t); - - key_out->data = reader->metadata + reader->offset + pos; - key_out->size_bytes = key_size; - pos += key_size; - - int32_t value_size; - memcpy(&value_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); - pos += sizeof(int32_t); - - value_out->data = reader->metadata + reader->offset + pos; - value_out->size_bytes = value_size; - pos += value_size; - - reader->offset += pos; - reader->remaining_keys--; - return NANOARROW_OK; -} - -int64_t ArrowMetadataSizeOf(const char* metadata) { - if (metadata == NULL) { - return 0; - } - - struct ArrowMetadataReader reader; - struct ArrowStringView key; - struct ArrowStringView value; - if (ArrowMetadataReaderInit(&reader, metadata) != NANOARROW_OK) { - return 0; - } - - int64_t size = sizeof(int32_t); - while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) { - size += sizeof(int32_t) + key.size_bytes + sizeof(int32_t) + value.size_bytes; - } - - return size; -} - -static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata, - struct ArrowStringView* key, - struct ArrowStringView* value_out) { - struct ArrowMetadataReader reader; - struct ArrowStringView existing_key; - struct ArrowStringView existing_value; - NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); - - while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) == - NANOARROW_OK) { - int key_equal = key->size_bytes == existing_key.size_bytes && - strncmp(key->data, existing_key.data, existing_key.size_bytes) == 0; - if (key_equal) { - value_out->data = existing_value.data; - value_out->size_bytes = existing_value.size_bytes; - break; - } - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, - struct ArrowStringView* value_out) { - if (value_out == NULL) { - return EINVAL; - } - - return ArrowMetadataGetValueInternal(metadata, &key, value_out); -} - -char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) { - struct ArrowStringView value = ArrowCharView(NULL); - if (ArrowMetadataGetValue(metadata, key, &value) != NANOARROW_OK) { - return 0; - } - - return value.data != NULL; -} - -ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, - const char* metadata) { - ArrowBufferInit(buffer); - return ArrowBufferAppend(buffer, metadata, ArrowMetadataSizeOf(metadata)); -} - -static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer* buffer, - struct ArrowStringView* key, - struct ArrowStringView* value) { - if (value == NULL) { - return NANOARROW_OK; - } - - if (buffer->capacity_bytes == 0) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(buffer, 0)); - } - - if (((size_t)buffer->capacity_bytes) < sizeof(int32_t)) { - return EINVAL; - } - - int32_t n_keys; - memcpy(&n_keys, buffer->data, sizeof(int32_t)); - - int32_t key_size = (int32_t)key->size_bytes; - int32_t value_size = (int32_t)value->size_bytes; - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve( - buffer, sizeof(int32_t) + key_size + sizeof(int32_t) + value_size)); - - ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t)); - ArrowBufferAppendUnsafe(buffer, key->data, key_size); - ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t)); - ArrowBufferAppendUnsafe(buffer, value->data, value_size); - - n_keys++; - memcpy(buffer->data, &n_keys, sizeof(int32_t)); - - return NANOARROW_OK; -} - -static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer* buffer, - struct ArrowStringView* key, - struct ArrowStringView* value) { - // Inspect the current value to see if we can avoid copying the buffer - struct ArrowStringView current_value = ArrowCharView(NULL); - NANOARROW_RETURN_NOT_OK( - ArrowMetadataGetValueInternal((const char*)buffer->data, key, ¤t_value)); - - // The key should be removed but no key exists - if (value == NULL && current_value.data == NULL) { - return NANOARROW_OK; - } - - // The key/value can be appended because no key exists - if (value != NULL && current_value.data == NULL) { - return ArrowMetadataBuilderAppendInternal(buffer, key, value); - } - - struct ArrowMetadataReader reader; - struct ArrowStringView existing_key; - struct ArrowStringView existing_value; - NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, (const char*)buffer->data)); - - struct ArrowBuffer new_buffer; - NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&new_buffer, NULL)); - - while (reader.remaining_keys > 0) { - int result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value); - if (result != NANOARROW_OK) { - ArrowBufferReset(&new_buffer); - return result; - } - - if (key->size_bytes == existing_key.size_bytes && - strncmp((const char*)key->data, (const char*)existing_key.data, - existing_key.size_bytes) == 0) { - result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value); - value = NULL; - } else { - result = - ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key, &existing_value); - } - - if (result != NANOARROW_OK) { - ArrowBufferReset(&new_buffer); - return result; - } - } - - ArrowBufferReset(buffer); - ArrowBufferMove(&new_buffer, buffer); - return NANOARROW_OK; -} - -ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, - struct ArrowStringView key, - struct ArrowStringView value) { - return ArrowMetadataBuilderAppendInternal(buffer, &key, &value); -} - -ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, - struct ArrowStringView key, - struct ArrowStringView value) { - return ArrowMetadataBuilderSetInternal(buffer, &key, &value); -} - -ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, - struct ArrowStringView key) { - return ArrowMetadataBuilderSetInternal(buffer, &key, NULL); -} -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include - -#include "nanoarrow.h" - -// -- changed for tiledb-r static -void ArrowArrayReleaseInternal(struct ArrowArray* array) { - // Release buffers held by this array - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - if (private_data != NULL) { - ArrowBitmapReset(&private_data->bitmap); - ArrowBufferReset(&private_data->buffers[0]); - ArrowBufferReset(&private_data->buffers[1]); - ArrowFree(private_data); - } - - // This object owns the memory for all the children, but those - // children may have been generated elsewhere and might have - // their own release() callback. - if (array->children != NULL) { - for (int64_t i = 0; i < array->n_children; i++) { - if (array->children[i] != NULL) { - if (array->children[i]->release != NULL) { - ArrowArrayRelease(array->children[i]); - } - - ArrowFree(array->children[i]); - } - } - - ArrowFree(array->children); - } - - // This object owns the memory for the dictionary but it - // may have been generated somewhere else and have its own - // release() callback. - if (array->dictionary != NULL) { - if (array->dictionary->release != NULL) { - ArrowArrayRelease(array->dictionary); - } - - ArrowFree(array->dictionary); - } - - // Mark released - array->release = NULL; -} - -// -- changed for tiledb-r static -ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, - enum ArrowType storage_type) { - switch (storage_type) { - case NANOARROW_TYPE_UNINITIALIZED: - case NANOARROW_TYPE_NA: - array->n_buffers = 0; - break; - - case NANOARROW_TYPE_FIXED_SIZE_LIST: - case NANOARROW_TYPE_STRUCT: - case NANOARROW_TYPE_SPARSE_UNION: - array->n_buffers = 1; - break; - - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_LARGE_LIST: - case NANOARROW_TYPE_MAP: - case NANOARROW_TYPE_BOOL: - case NANOARROW_TYPE_UINT8: - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT16: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT32: - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT64: - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_HALF_FLOAT: - case NANOARROW_TYPE_FLOAT: - case NANOARROW_TYPE_DOUBLE: - case NANOARROW_TYPE_DECIMAL128: - case NANOARROW_TYPE_DECIMAL256: - case NANOARROW_TYPE_INTERVAL_MONTHS: - case NANOARROW_TYPE_INTERVAL_DAY_TIME: - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - case NANOARROW_TYPE_DENSE_UNION: - array->n_buffers = 2; - break; - - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_BINARY: - case NANOARROW_TYPE_LARGE_BINARY: - array->n_buffers = 3; - break; - - default: - return EINVAL; - - return NANOARROW_OK; - } - - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - private_data->storage_type = storage_type; - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, - enum ArrowType storage_type) { - array->length = 0; - array->null_count = 0; - array->offset = 0; - array->n_buffers = 0; - array->n_children = 0; - array->buffers = NULL; - array->children = NULL; - array->dictionary = NULL; - array->release = &ArrowArrayReleaseInternal; - array->private_data = NULL; - - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)ArrowMalloc(sizeof(struct ArrowArrayPrivateData)); - if (private_data == NULL) { - array->release = NULL; - return ENOMEM; - } - - ArrowBitmapInit(&private_data->bitmap); - ArrowBufferInit(&private_data->buffers[0]); - ArrowBufferInit(&private_data->buffers[1]); - private_data->buffer_data[0] = NULL; - private_data->buffer_data[1] = NULL; - private_data->buffer_data[2] = NULL; - - array->private_data = private_data; - array->buffers = (const void**)(&private_data->buffer_data); - - int result = ArrowArraySetStorageType(array, storage_type); - if (result != NANOARROW_OK) { - ArrowArrayRelease(array); - return result; - } - - ArrowLayoutInit(&private_data->layout, storage_type); - // We can only know this not to be true when initializing based on a schema - // so assume this to be true. - private_data->union_type_id_is_child_index = 1; - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, - const struct ArrowArrayView* array_view, - struct ArrowError* error) { - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowArrayInitFromType(array, array_view->storage_type), error); - int result; - - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - private_data->layout = array_view->layout; - - if (array_view->n_children > 0) { - result = ArrowArrayAllocateChildren(array, array_view->n_children); - if (result != NANOARROW_OK) { - ArrowArrayRelease(array); - return result; - } - - for (int64_t i = 0; i < array_view->n_children; i++) { - result = - ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); - if (result != NANOARROW_OK) { - ArrowArrayRelease(array); - return result; - } - } - } - - if (array_view->dictionary != NULL) { - result = ArrowArrayAllocateDictionary(array); - if (result != NANOARROW_OK) { - ArrowArrayRelease(array); - return result; - } - - result = - ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); - if (result != NANOARROW_OK) { - ArrowArrayRelease(array); - return result; - } - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, - const struct ArrowSchema* schema, - struct ArrowError* error) { - struct ArrowArrayView array_view; - NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error)); - NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(array, &array_view, error)); - if (array_view.storage_type == NANOARROW_TYPE_DENSE_UNION || - array_view.storage_type == NANOARROW_TYPE_SPARSE_UNION) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - // We can still build arrays if this isn't true; however, the append - // functions won't work. Instead, we store this value and error only - // when StartAppending is called. - private_data->union_type_id_is_child_index = - _ArrowUnionTypeIdsWillEqualChildIndices(schema->format + 4, schema->n_children); - } - - ArrowArrayViewReset(&array_view); - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children) { - if (array->children != NULL) { - return EINVAL; - } - - if (n_children == 0) { - return NANOARROW_OK; - } - - array->children = - (struct ArrowArray**)ArrowMalloc(n_children * sizeof(struct ArrowArray*)); - if (array->children == NULL) { - return ENOMEM; - } - - memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); - - for (int64_t i = 0; i < n_children; i++) { - array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); - if (array->children[i] == NULL) { - return ENOMEM; - } - array->children[i]->release = NULL; - } - - array->n_children = n_children; - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array) { - if (array->dictionary != NULL) { - return EINVAL; - } - - array->dictionary = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); - if (array->dictionary == NULL) { - return ENOMEM; - } - - array->dictionary->release = NULL; - return NANOARROW_OK; -} - -void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - ArrowBufferMove(&bitmap->buffer, &private_data->bitmap.buffer); - private_data->bitmap.size_bits = bitmap->size_bits; - bitmap->size_bits = 0; - private_data->buffer_data[0] = private_data->bitmap.buffer.data; - array->null_count = -1; -} - -ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, - struct ArrowBuffer* buffer) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - switch (i) { - case 0: - ArrowBufferMove(buffer, &private_data->bitmap.buffer); - private_data->buffer_data[i] = private_data->bitmap.buffer.data; - break; - case 1: - case 2: - ArrowBufferMove(buffer, &private_data->buffers[i - 1]); - private_data->buffer_data[i] = private_data->buffers[i - 1].data; - break; - default: - return EINVAL; - } - - return NANOARROW_OK; -} - -static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_view, - struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - ArrowArrayViewInitFromType(array_view, private_data->storage_type); - array_view->layout = private_data->layout; - array_view->array = array; - array_view->length = array->length; - array_view->offset = array->offset; - array_view->null_count = array->null_count; - - array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; - array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; - array_view->buffer_views[1].data.as_uint8 = private_data->buffers[0].data; - array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; - array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; - array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; - - int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(array_view); - return result; - } - - for (int64_t i = 0; i < array->n_children; i++) { - result = ArrowArrayViewInitFromArray(array_view->children[i], array->children[i]); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(array_view); - return result; - } - } - - if (array->dictionary != NULL) { - result = ArrowArrayViewAllocateDictionary(array_view); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(array_view); - return result; - } - - result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(array_view); - return result; - } - } - - return NANOARROW_OK; -} - -static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array, - struct ArrowArrayView* array_view) { - // Loop through buffers and reserve the extra space that we know about - for (int64_t i = 0; i < array->n_buffers; i++) { - // Don't reserve on a validity buffer that hasn't been allocated yet - if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && - ArrowArrayBuffer(array, i)->data == NULL) { - continue; - } - - int64_t additional_size_bytes = - array_view->buffer_views[i].size_bytes - ArrowArrayBuffer(array, i)->size_bytes; - - if (additional_size_bytes > 0) { - NANOARROW_RETURN_NOT_OK( - ArrowBufferReserve(ArrowArrayBuffer(array, i), additional_size_bytes)); - } - } - - // Recursively reserve children - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK( - ArrowArrayReserveInternal(array->children[i], array_view->children[i])); - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, - int64_t additional_size_elements) { - struct ArrowArrayView array_view; - NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array)); - - // Calculate theoretical buffer sizes (recursively) - ArrowArrayViewSetLength(&array_view, array->length + additional_size_elements); - - // Walk the structure (recursively) - int result = ArrowArrayReserveInternal(array, &array_view); - ArrowArrayViewReset(&array_view); - if (result != NANOARROW_OK) { - return result; - } - - return NANOARROW_OK; -} - -static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - // The only buffer finalizing this currently does is make sure the data - // buffer for (Large)String|Binary is never NULL - switch (private_data->storage_type) { - case NANOARROW_TYPE_BINARY: - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - case NANOARROW_TYPE_LARGE_STRING: - if (ArrowArrayBuffer(array, 2)->data == NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0)); - } - break; - default: - break; - } - - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i])); - } - - if (array->dictionary != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); - } - - return NANOARROW_OK; -} - -static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; - } - - for (int64_t i = 0; i < array->n_children; i++) { - ArrowArrayFlushInternalPointers(array->children[i]); - } - - if (array->dictionary != NULL) { - ArrowArrayFlushInternalPointers(array->dictionary); - } -} - -ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, - enum ArrowValidationLevel validation_level, - struct ArrowError* error) { - // Even if the data buffer is size zero, the pointer value needed to be non-null - // in some implementations (at least one version of Arrow C++ at the time this - // was added). Only do this fix if we can assume CPU data access. - if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); - } - - // Make sure the value we get with array->buffers[i] is set to the actual - // pointer (which may have changed from the original due to reallocation) - ArrowArrayFlushInternalPointers(array); - - if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) { - return NANOARROW_OK; - } - - // For validation, initialize an ArrowArrayView with our known buffer sizes - struct ArrowArrayView array_view; - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array), - error); - int result = ArrowArrayViewValidate(&array_view, validation_level, error); - ArrowArrayViewReset(&array_view); - return result; -} - -ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, - struct ArrowError* error) { - return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error); -} - -void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, - enum ArrowType storage_type) { - memset(array_view, 0, sizeof(struct ArrowArrayView)); - array_view->storage_type = storage_type; - ArrowLayoutInit(&array_view->layout, storage_type); -} - -ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, - int64_t n_children) { - if (array_view->children != NULL) { - return EINVAL; - } - - array_view->children = - (struct ArrowArrayView**)ArrowMalloc(n_children * sizeof(struct ArrowArrayView*)); - if (array_view->children == NULL) { - return ENOMEM; - } - - for (int64_t i = 0; i < n_children; i++) { - array_view->children[i] = NULL; - } - - array_view->n_children = n_children; - - for (int64_t i = 0; i < n_children; i++) { - array_view->children[i] = - (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); - if (array_view->children[i] == NULL) { - return ENOMEM; - } - ArrowArrayViewInitFromType(array_view->children[i], NANOARROW_TYPE_UNINITIALIZED); - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) { - if (array_view->dictionary != NULL) { - return EINVAL; - } - - array_view->dictionary = - (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); - if (array_view->dictionary == NULL) { - return ENOMEM; - } - - ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED); - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, - const struct ArrowSchema* schema, - struct ArrowError* error) { - struct ArrowSchemaView schema_view; - int result = ArrowSchemaViewInit(&schema_view, schema, error); - if (result != NANOARROW_OK) { - return result; - } - - ArrowArrayViewInitFromType(array_view, schema_view.storage_type); - array_view->layout = schema_view.layout; - - result = ArrowArrayViewAllocateChildren(array_view, schema->n_children); - if (result != NANOARROW_OK) { - ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed"); - ArrowArrayViewReset(array_view); - return result; - } - - for (int64_t i = 0; i < schema->n_children; i++) { - result = - ArrowArrayViewInitFromSchema(array_view->children[i], schema->children[i], error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(array_view); - return result; - } - } - - if (schema->dictionary != NULL) { - result = ArrowArrayViewAllocateDictionary(array_view); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(array_view); - return result; - } - - result = - ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(array_view); - return result; - } - } - - if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION || - array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) { - array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t)); - if (array_view->union_type_id_map == NULL) { - return ENOMEM; - } - - memset(array_view->union_type_id_map, -1, 256); - int32_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, - array_view->union_type_id_map + 128); - for (int8_t child_index = 0; child_index < n_type_ids; child_index++) { - int8_t type_id = array_view->union_type_id_map[128 + child_index]; - array_view->union_type_id_map[type_id] = child_index; - } - } - - return NANOARROW_OK; -} - -void ArrowArrayViewReset(struct ArrowArrayView* array_view) { - if (array_view->children != NULL) { - for (int64_t i = 0; i < array_view->n_children; i++) { - if (array_view->children[i] != NULL) { - ArrowArrayViewReset(array_view->children[i]); - ArrowFree(array_view->children[i]); - } - } - - ArrowFree(array_view->children); - } - - if (array_view->dictionary != NULL) { - ArrowArrayViewReset(array_view->dictionary); - ArrowFree(array_view->dictionary); - } - - if (array_view->union_type_id_map != NULL) { - ArrowFree(array_view->union_type_id_map); - } - - ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED); -} - -void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; - - switch (array_view->layout.buffer_type[i]) { - case NANOARROW_BUFFER_TYPE_VALIDITY: - array_view->buffer_views[i].size_bytes = _ArrowBytesForBits(length); - continue; - case NANOARROW_BUFFER_TYPE_DATA_OFFSET: - // Probably don't want/need to rely on the producer to have allocated an - // offsets buffer of length 1 for a zero-size array - array_view->buffer_views[i].size_bytes = - (length != 0) * element_size_bytes * (length + 1); - continue; - case NANOARROW_BUFFER_TYPE_DATA: - array_view->buffer_views[i].size_bytes = - _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) / - 8; - continue; - case NANOARROW_BUFFER_TYPE_TYPE_ID: - case NANOARROW_BUFFER_TYPE_UNION_OFFSET: - array_view->buffer_views[i].size_bytes = element_size_bytes * length; - continue; - case NANOARROW_BUFFER_TYPE_NONE: - array_view->buffer_views[i].size_bytes = 0; - continue; - } - } - - switch (array_view->storage_type) { - case NANOARROW_TYPE_STRUCT: - case NANOARROW_TYPE_SPARSE_UNION: - for (int64_t i = 0; i < array_view->n_children; i++) { - ArrowArrayViewSetLength(array_view->children[i], length); - } - break; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - if (array_view->n_children >= 1) { - ArrowArrayViewSetLength(array_view->children[0], - length * array_view->layout.child_size_elements); - } - default: - break; - } -} - -// This version recursively extracts information from the array and stores it -// in the array view, performing any checks that require the original array. -static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, - const struct ArrowArray* array, - struct ArrowError* error) { - array_view->array = array; - array_view->offset = array->offset; - array_view->length = array->length; - array_view->null_count = array->null_count; - - int64_t buffers_required = 0; - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { - break; - } - - buffers_required++; - - // Set buffer pointer - array_view->buffer_views[i].data.data = array->buffers[i]; - - // If non-null, set buffer size to unknown. - if (array->buffers[i] == NULL) { - array_view->buffer_views[i].size_bytes = 0; - } else { - array_view->buffer_views[i].size_bytes = -1; - } - } - - // Check the number of buffers - if (buffers_required != array->n_buffers) { - ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)", - (int)buffers_required, (int)array->n_buffers); - return EINVAL; - } - - // Check number of children - if (array_view->n_children != array->n_children) { - ArrowErrorSet(error, "Expected %ld children but found %ld children", - (long)array_view->n_children, (long)array->n_children); - return EINVAL; - } - - // Recurse for children - for (int64_t i = 0; i < array_view->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i], - array->children[i], error)); - } - - // Check dictionary - if (array->dictionary == NULL && array_view->dictionary != NULL) { - ArrowErrorSet(error, "Expected dictionary but found NULL"); - return EINVAL; - } - - if (array->dictionary != NULL && array_view->dictionary == NULL) { - ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member"); - return EINVAL; - } - - if (array->dictionary != NULL) { - NANOARROW_RETURN_NOT_OK( - ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error)); - } - - return NANOARROW_OK; -} - -static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, - struct ArrowError* error) { - if (array_view->length < 0) { - ArrowErrorSet(error, "Expected length >= 0 but found length %ld", - (long)array_view->length); - return EINVAL; - } - - if (array_view->offset < 0) { - ArrowErrorSet(error, "Expected offset >= 0 but found offset %ld", - (long)array_view->offset); - return EINVAL; - } - - // Calculate buffer sizes that do not require buffer access. If marked as - // unknown, assign the buffer size; otherwise, validate it. - int64_t offset_plus_length = array_view->offset + array_view->length; - - // Only loop over the first two buffers because the size of the third buffer - // is always data dependent for all current Arrow types. - for (int i = 0; i < 2; i++) { - int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; - // Initialize with a value that will cause an error if accidentally used uninitialized - int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1; - - switch (array_view->layout.buffer_type[i]) { - case NANOARROW_BUFFER_TYPE_VALIDITY: - if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) { - continue; - } - - min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length); - break; - case NANOARROW_BUFFER_TYPE_DATA_OFFSET: - // Probably don't want/need to rely on the producer to have allocated an - // offsets buffer of length 1 for a zero-size array - min_buffer_size_bytes = - (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1); - break; - case NANOARROW_BUFFER_TYPE_DATA: - min_buffer_size_bytes = - _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * - offset_plus_length) / - 8; - break; - case NANOARROW_BUFFER_TYPE_TYPE_ID: - case NANOARROW_BUFFER_TYPE_UNION_OFFSET: - min_buffer_size_bytes = element_size_bytes * offset_plus_length; - break; - case NANOARROW_BUFFER_TYPE_NONE: - continue; - } - - // Assign or validate buffer size - if (array_view->buffer_views[i].size_bytes == -1) { - array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; - } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { - ArrowErrorSet(error, - "Expected %s array buffer %d to have size >= %ld bytes but found " - "buffer with %ld bytes", - ArrowTypeString(array_view->storage_type), (int)i, - (long)min_buffer_size_bytes, - (long)array_view->buffer_views[i].size_bytes); - return EINVAL; - } - } - - // For list, fixed-size list and map views, we can validate the number of children - switch (array_view->storage_type) { - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_LARGE_LIST: - case NANOARROW_TYPE_FIXED_SIZE_LIST: - case NANOARROW_TYPE_MAP: - if (array_view->n_children != 1) { - ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays", - ArrowTypeString(array_view->storage_type), - (long)array_view->n_children); - return EINVAL; - } - default: - break; - } - - // For struct, the sparse union, and the fixed-size list views, we can validate child - // lengths. - int64_t child_min_length; - switch (array_view->storage_type) { - case NANOARROW_TYPE_SPARSE_UNION: - case NANOARROW_TYPE_STRUCT: - child_min_length = (array_view->offset + array_view->length); - for (int64_t i = 0; i < array_view->n_children; i++) { - if (array_view->children[i]->length < child_min_length) { - ArrowErrorSet( - error, - "Expected struct child %d to have length >= %ld but found child with " - "length %ld", - (int)(i + 1), (long)(child_min_length), - (long)array_view->children[i]->length); - return EINVAL; - } - } - break; - - case NANOARROW_TYPE_FIXED_SIZE_LIST: - child_min_length = (array_view->offset + array_view->length) * - array_view->layout.child_size_elements; - if (array_view->children[0]->length < child_min_length) { - ArrowErrorSet(error, - "Expected child of fixed_size_list array to have length >= %ld but " - "found array with length %ld", - (long)child_min_length, (long)array_view->children[0]->length); - return EINVAL; - } - break; - default: - break; - } - - // Recurse for children - for (int64_t i = 0; i < array_view->n_children; i++) { - NANOARROW_RETURN_NOT_OK( - ArrowArrayViewValidateMinimal(array_view->children[i], error)); - } - - // Recurse for dictionary - if (array_view->dictionary != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error)); - } - - return NANOARROW_OK; -} - -static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, - struct ArrowError* error) { - // Perform minimal validation. This will validate or assign - // buffer sizes as long as buffer access is not required. - NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); - - // Calculate buffer sizes or child lengths that require accessing the offsets - // buffer. Where appropriate, validate that the first offset is >= 0. - // If a buffer size is marked as unknown, assign it; otherwise, validate it. - int64_t offset_plus_length = array_view->offset + array_view->length; - - int64_t first_offset; - int64_t last_offset; - switch (array_view->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int32[0]; - if (first_offset < 0) { - ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", - (long)first_offset); - return EINVAL; - } - - last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; - - // If the data buffer size is unknown, assign it; otherwise, check it - if (array_view->buffer_views[2].size_bytes == -1) { - array_view->buffer_views[2].size_bytes = last_offset; - } else if (array_view->buffer_views[2].size_bytes < last_offset) { - ArrowErrorSet(error, - "Expected %s array buffer 2 to have size >= %ld bytes but found " - "buffer with %ld bytes", - ArrowTypeString(array_view->storage_type), (long)last_offset, - (long)array_view->buffer_views[2].size_bytes); - return EINVAL; - } - } - break; - - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int64[0]; - if (first_offset < 0) { - ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", - (long)first_offset); - return EINVAL; - } - - last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; - - // If the data buffer size is unknown, assign it; otherwise, check it - if (array_view->buffer_views[2].size_bytes == -1) { - array_view->buffer_views[2].size_bytes = last_offset; - } else if (array_view->buffer_views[2].size_bytes < last_offset) { - ArrowErrorSet(error, - "Expected %s array buffer 2 to have size >= %ld bytes but found " - "buffer with %ld bytes", - ArrowTypeString(array_view->storage_type), (long)last_offset, - (long)array_view->buffer_views[2].size_bytes); - return EINVAL; - } - } - break; - - case NANOARROW_TYPE_STRUCT: - for (int64_t i = 0; i < array_view->n_children; i++) { - if (array_view->children[i]->length < offset_plus_length) { - ArrowErrorSet( - error, - "Expected struct child %d to have length >= %ld but found child with " - "length %ld", - (int)(i + 1), (long)offset_plus_length, - (long)array_view->children[i]->length); - return EINVAL; - } - } - break; - - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_MAP: - if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int32[0]; - if (first_offset < 0) { - ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", - (long)first_offset); - return EINVAL; - } - - last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; - if (array_view->children[0]->length < last_offset) { - ArrowErrorSet( - error, - "Expected child of %s array to have length >= %ld but found array with " - "length %ld", - ArrowTypeString(array_view->storage_type), (long)last_offset, - (long)array_view->children[0]->length); - return EINVAL; - } - } - break; - - case NANOARROW_TYPE_LARGE_LIST: - if (array_view->buffer_views[1].size_bytes != 0) { - first_offset = array_view->buffer_views[1].data.as_int64[0]; - if (first_offset < 0) { - ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", - (long)first_offset); - return EINVAL; - } - - last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; - if (array_view->children[0]->length < last_offset) { - ArrowErrorSet( - error, - "Expected child of large list array to have length >= %ld but found array " - "with length %ld", - (long)last_offset, (long)array_view->children[0]->length); - return EINVAL; - } - } - break; - default: - break; - } - - // Recurse for children - for (int64_t i = 0; i < array_view->n_children; i++) { - NANOARROW_RETURN_NOT_OK( - ArrowArrayViewValidateDefault(array_view->children[i], error)); - } - - // Recurse for dictionary - if (array_view->dictionary != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error)); - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, - const struct ArrowArray* array, - struct ArrowError* error) { - // Extract information from the array into the array view - NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); - - // Run default validation. Because we've marked all non-NULL buffers as having unknown - // size, validation will also update the buffer sizes as it goes. - NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, - const struct ArrowArray* array, - struct ArrowError* error) { - // Extract information from the array into the array view - NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); - - // Run default validation. Because we've marked all non-NULL buffers as having unknown - // size, validation will also update the buffer sizes as it goes. - NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); - - return NANOARROW_OK; -} - -static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, - struct ArrowError* error) { - if (view.size_bytes <= (int64_t)sizeof(int32_t)) { - return NANOARROW_OK; - } - - for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { - if (view.data.as_int32[i] < view.data.as_int32[i - 1]) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); - return EINVAL; - } - } - - return NANOARROW_OK; -} - -static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, - struct ArrowError* error) { - if (view.size_bytes <= (int64_t)sizeof(int64_t)) { - return NANOARROW_OK; - } - - for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { - if (view.data.as_int64[i] < view.data.as_int64[i - 1]) { - ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); - return EINVAL; - } - } - - return NANOARROW_OK; -} - -static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value, - int8_t max_value, struct ArrowError* error) { - for (int64_t i = 0; i < view.size_bytes; i++) { - if (view.data.as_int8[i] < min_value || view.data.as_int8[i] > max_value) { - ArrowErrorSet(error, - "[%ld] Expected buffer value between %d and %d but found value %d", - (long)i, (int)min_value, (int)max_value, (int)view.data.as_int8[i]); - return EINVAL; - } - } - - return NANOARROW_OK; -} - -static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, - int64_t n_values, struct ArrowError* error) { - for (int64_t i = 0; i < view.size_bytes; i++) { - int item_found = 0; - for (int64_t j = 0; j < n_values; j++) { - if (view.data.as_int8[i] == values[j]) { - item_found = 1; - break; - } - } - - if (!item_found) { - ArrowErrorSet(error, "[%ld] Unexpected buffer value %d", (long)i, - (int)view.data.as_int8[i]); - return EINVAL; - } - } - - return NANOARROW_OK; -} - -static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, - struct ArrowError* error) { - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - switch (array_view->layout.buffer_type[i]) { - case NANOARROW_BUFFER_TYPE_DATA_OFFSET: - if (array_view->layout.element_size_bits[i] == 32) { - NANOARROW_RETURN_NOT_OK( - ArrowAssertIncreasingInt32(array_view->buffer_views[i], error)); - } else { - NANOARROW_RETURN_NOT_OK( - ArrowAssertIncreasingInt64(array_view->buffer_views[i], error)); - } - break; - default: - break; - } - } - - if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || - array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { - if (array_view->union_type_id_map == NULL) { - // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + - // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough - // information to validate this buffer. - ArrowErrorSet(error, - "Insufficient information provided for validation of union array"); - return EINVAL; - } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices( - array_view->union_type_id_map, array_view->n_children, - array_view->n_children)) { - NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( - array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error)); - } else { - NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0], - array_view->union_type_id_map + 128, - array_view->n_children, error)); - } - } - - if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION && - array_view->union_type_id_map != NULL) { - // Check that offsets refer to child elements that actually exist - for (int64_t i = 0; i < array_view->length; i++) { - int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i); - int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); - int64_t child_length = array_view->children[child_id]->length; - if (offset < 0 || offset > child_length) { - ArrowErrorSet( - error, - "[%ld] Expected union offset for child id %d to be between 0 and %ld but " - "found offset value %ld", - (long)i, (int)child_id, (long)child_length, (long)offset); - return EINVAL; - } - } - } - - // Recurse for children - for (int64_t i = 0; i < array_view->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); - } - - // Dictionary valiation not implemented - if (array_view->dictionary != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, error)); - // TODO: validate the indices - } - - return NANOARROW_OK; -} - -ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, - enum ArrowValidationLevel validation_level, - struct ArrowError* error) { - switch (validation_level) { - case NANOARROW_VALIDATION_LEVEL_NONE: - return NANOARROW_OK; - case NANOARROW_VALIDATION_LEVEL_MINIMAL: - return ArrowArrayViewValidateMinimal(array_view, error); - case NANOARROW_VALIDATION_LEVEL_DEFAULT: - return ArrowArrayViewValidateDefault(array_view, error); - case NANOARROW_VALIDATION_LEVEL_FULL: - NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); - return ArrowArrayViewValidateFull(array_view, error); - } - - ArrowErrorSet(error, "validation_level not recognized"); - return EINVAL; -} -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include "nanoarrow.h" - -struct BasicArrayStreamPrivate { - struct ArrowSchema schema; - int64_t n_arrays; - struct ArrowArray* arrays; - int64_t arrays_i; -}; - -static int ArrowBasicArrayStreamGetSchema(struct ArrowArrayStream* array_stream, - struct ArrowSchema* schema) { - if (array_stream == NULL || array_stream->release == NULL) { - return EINVAL; - } - - struct BasicArrayStreamPrivate* private_data = - (struct BasicArrayStreamPrivate*)array_stream->private_data; - return ArrowSchemaDeepCopy(&private_data->schema, schema); -} - -static int ArrowBasicArrayStreamGetNext(struct ArrowArrayStream* array_stream, - struct ArrowArray* array) { - if (array_stream == NULL || array_stream->release == NULL) { - return EINVAL; - } - - struct BasicArrayStreamPrivate* private_data = - (struct BasicArrayStreamPrivate*)array_stream->private_data; - - if (private_data->arrays_i == private_data->n_arrays) { - array->release = NULL; - return NANOARROW_OK; - } - - ArrowArrayMove(&private_data->arrays[private_data->arrays_i++], array); - return NANOARROW_OK; -} - -static const char* ArrowBasicArrayStreamGetLastError( - struct ArrowArrayStream* array_stream) { - NANOARROW_UNUSED(array_stream); - return NULL; -} - -static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) { - if (array_stream == NULL || array_stream->release == NULL) { - return; - } - - struct BasicArrayStreamPrivate* private_data = - (struct BasicArrayStreamPrivate*)array_stream->private_data; - - if (private_data->schema.release != NULL) { - ArrowSchemaRelease(&private_data->schema); - } - - for (int64_t i = 0; i < private_data->n_arrays; i++) { - if (private_data->arrays[i].release != NULL) { - ArrowArrayRelease(&private_data->arrays[i]); - } - } - - if (private_data->arrays != NULL) { - ArrowFree(private_data->arrays); - } - - ArrowFree(private_data); - array_stream->release = NULL; -} - -ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, - struct ArrowSchema* schema, int64_t n_arrays) { - struct BasicArrayStreamPrivate* private_data = - (struct BasicArrayStreamPrivate*)ArrowMalloc( - sizeof(struct BasicArrayStreamPrivate)); - if (private_data == NULL) { - return ENOMEM; - } - - ArrowSchemaMove(schema, &private_data->schema); - - private_data->n_arrays = n_arrays; - private_data->arrays = NULL; - private_data->arrays_i = 0; - - if (n_arrays > 0) { - private_data->arrays = - (struct ArrowArray*)ArrowMalloc(n_arrays * sizeof(struct ArrowArray)); - if (private_data->arrays == NULL) { - ArrowBasicArrayStreamRelease(array_stream); - return ENOMEM; - } - } - - for (int64_t i = 0; i < private_data->n_arrays; i++) { - private_data->arrays[i].release = NULL; - } - - array_stream->get_schema = &ArrowBasicArrayStreamGetSchema; - array_stream->get_next = &ArrowBasicArrayStreamGetNext; - array_stream->get_last_error = ArrowBasicArrayStreamGetLastError; - array_stream->release = ArrowBasicArrayStreamRelease; - array_stream->private_data = private_data; - return NANOARROW_OK; -} - -void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, - struct ArrowArray* array) { - struct BasicArrayStreamPrivate* private_data = - (struct BasicArrayStreamPrivate*)array_stream->private_data; - ArrowArrayMove(array, &private_data->arrays[i]); -} - -ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, - struct ArrowError* error) { - struct BasicArrayStreamPrivate* private_data = - (struct BasicArrayStreamPrivate*)array_stream->private_data; - - struct ArrowArrayView array_view; - NANOARROW_RETURN_NOT_OK( - ArrowArrayViewInitFromSchema(&array_view, &private_data->schema, error)); - - for (int64_t i = 0; i < private_data->n_arrays; i++) { - if (private_data->arrays[i].release != NULL) { - int result = ArrowArrayViewSetArray(&array_view, &private_data->arrays[i], error); - if (result != NANOARROW_OK) { - ArrowArrayViewReset(&array_view); - return result; - } - } - } - - ArrowArrayViewReset(&array_view); - return NANOARROW_OK; -} diff --git a/apis/r/src/nanoarrow.h b/apis/r/src/nanoarrow.h deleted file mode 100644 index e338560f1a..0000000000 --- a/apis/r/src/nanoarrow.h +++ /dev/null @@ -1,3736 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_BUILD_ID_H_INCLUDED -#define NANOARROW_BUILD_ID_H_INCLUDED - -#define NANOARROW_VERSION_MAJOR 0 -#define NANOARROW_VERSION_MINOR 4 -#define NANOARROW_VERSION_PATCH 0 -#define NANOARROW_VERSION "0.4.0-SNAPSHOT" - -#define NANOARROW_VERSION_INT \ - (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ - NANOARROW_VERSION_PATCH) - -// #define NANOARROW_NAMESPACE YourNamespaceHere - -#endif -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_NANOARROW_TYPES_H_INCLUDED -#define NANOARROW_NANOARROW_TYPES_H_INCLUDED - -#include -#include - - - -#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) -#include -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -// Extra guard for versions of Arrow without the canonical guard -#ifndef ARROW_FLAG_DICTIONARY_ORDERED - -/// \defgroup nanoarrow-arrow-cdata Arrow C Data interface -/// -/// The Arrow C Data (https://arrow.apache.org/docs/format/CDataInterface.html) -/// and Arrow C Stream (https://arrow.apache.org/docs/format/CStreamInterface.html) -/// interfaces are part of the -/// Arrow Columnar Format specification -/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for -/// documentation of these structures. -/// -/// @{ - -#ifndef ARROW_C_DATA_INTERFACE -#define ARROW_C_DATA_INTERFACE - -#define ARROW_FLAG_DICTIONARY_ORDERED 1 -#define ARROW_FLAG_NULLABLE 2 -#define ARROW_FLAG_MAP_KEYS_SORTED 4 - -struct ArrowSchema { - // Array type description - const char* format; - const char* name; - const char* metadata; - int64_t flags; - int64_t n_children; - struct ArrowSchema** children; - struct ArrowSchema* dictionary; - - // Release callback - void (*release)(struct ArrowSchema*); - // Opaque producer-specific data - void* private_data; -}; - -struct ArrowArray { - // Array data description - int64_t length; - int64_t null_count; - int64_t offset; - int64_t n_buffers; - int64_t n_children; - const void** buffers; - struct ArrowArray** children; - struct ArrowArray* dictionary; - - // Release callback - void (*release)(struct ArrowArray*); - // Opaque producer-specific data - void* private_data; -}; - -#endif // ARROW_C_DATA_INTERFACE - -#ifndef ARROW_C_STREAM_INTERFACE -#define ARROW_C_STREAM_INTERFACE - -struct ArrowArrayStream { - // Callback to get the stream type - // (will be the same for all arrays in the stream). - // - // Return value: 0 if successful, an `errno`-compatible error code otherwise. - // - // If successful, the ArrowSchema must be released independently from the stream. - int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); - - // Callback to get the next array - // (if no error and the array is released, the stream has ended) - // - // Return value: 0 if successful, an `errno`-compatible error code otherwise. - // - // If successful, the ArrowArray must be released independently from the stream. - int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); - - // Callback to get optional detailed error information. - // This must only be called if the last stream operation failed - // with a non-0 return code. - // - // Return value: pointer to a null-terminated character array describing - // the last error, or NULL if no description is available. - // - // The returned pointer is only valid until the next operation on this stream - // (including release). - const char* (*get_last_error)(struct ArrowArrayStream*); - - // Release callback: release the stream's own resources. - // Note that arrays returned by `get_next` must be individually released. - void (*release)(struct ArrowArrayStream*); - - // Opaque producer-specific data - void* private_data; -}; - -#endif // ARROW_C_STREAM_INTERFACE -#endif // ARROW_FLAG_DICTIONARY_ORDERED - -/// @} - -// Utility macros -#define _NANOARROW_CONCAT(x, y) x##y -#define _NANOARROW_MAKE_NAME(x, y) _NANOARROW_CONCAT(x, y) - -#define _NANOARROW_RETURN_NOT_OK_IMPL(NAME, EXPR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) return NAME; \ - } while (0) - -#define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ - NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) - -#define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ - NANOARROW_RETURN_NOT_OK((x_ <= max_) ? NANOARROW_OK : EINVAL) - -#if defined(NANOARROW_DEBUG) -#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) { \ - ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d\n* %s:%d", EXPR_STR, \ - NAME, __FILE__, __LINE__); \ - return NAME; \ - } \ - } while (0) -#else -#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) { \ - ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ - return NAME; \ - } \ - } while (0) -#endif - -#if defined(NANOARROW_DEBUG) -// For checking ArrowErrorSet() calls for valid printf format strings/arguments -// If using mingw's c99-compliant printf, we need a different format-checking attribute -#if defined(__USE_MINGW_ANSI_STDIO) && defined(__MINGW_PRINTF_FORMAT) -#define NANOARROW_CHECK_PRINTF_ATTRIBUTE \ - __attribute__((format(__MINGW_PRINTF_FORMAT, 2, 3))) -#elif defined(__GNUC__) -#define NANOARROW_CHECK_PRINTF_ATTRIBUTE __attribute__((format(printf, 2, 3))) -#else -#define NANOARROW_CHECK_PRINTF_ATTRIBUTE -#endif - -// For checking calls to functions that return ArrowErrorCode -#if defined(__GNUC__) && (__GNUC__ >= 4) -#define NANOARROW_CHECK_RETURN_ATTRIBUTE __attribute__((warn_unused_result)) -#elif defined(_MSC_VER) && (_MSC_VER >= 1700) -#define NANOARROW_CHECK_RETURN_ATTRIBUTE _Check_return_ -#else -#define NANOARROW_CHECK_RETURN_ATTRIBUTE -#endif - -#else -#define NANOARROW_CHECK_RETURN_ATTRIBUTE -#define NANOARROW_CHECK_PRINTF_ATTRIBUTE -#endif - -#define NANOARROW_UNUSED(x) (void)(x) - -/// \brief Return code for success. -/// \ingroup nanoarrow-errors -#define NANOARROW_OK 0 - -/// \brief Represents an errno-compatible error code -/// \ingroup nanoarrow-errors -typedef int ArrowErrorCode; - -#if defined(NANOARROW_DEBUG) -#define ArrowErrorCode NANOARROW_CHECK_RETURN_ATTRIBUTE ArrowErrorCode -#endif - -/// \brief Flags supported by ArrowSchemaViewInit() -/// \ingroup nanoarrow-schema-view -#define NANOARROW_FLAG_ALL_SUPPORTED \ - (ARROW_FLAG_DICTIONARY_ORDERED | ARROW_FLAG_NULLABLE | ARROW_FLAG_MAP_KEYS_SORTED) - -/// \brief Error type containing a UTF-8 encoded message. -/// \ingroup nanoarrow-errors -struct ArrowError { - /// \brief A character buffer with space for an error message. - char message[1024]; -}; - -/// \brief Ensure an ArrowError is null-terminated by zeroing the first character. -/// \ingroup nanoarrow-errors -/// -/// If error is NULL, this function does nothing. -static inline void ArrowErrorInit(struct ArrowError* error) { - if (error != NULL) { - error->message[0] = '\0'; - } -} - -/// \brief Get the contents of an error -/// \ingroup nanoarrow-errors -/// -/// If error is NULL, returns "", or returns the contents of the error message -/// otherwise. -static inline const char* ArrowErrorMessage(struct ArrowError* error) { - if (error == NULL) { - return ""; - } else { - return error->message; - } -} - -/// \brief Set the contents of an error from an existing null-terminated string -/// \ingroup nanoarrow-errors -/// -/// If error is NULL, this function does nothing. -static inline void ArrowErrorSetString(struct ArrowError* error, const char* src) { - if (error == NULL) { - return; - } - - int64_t src_len = strlen(src); - if (src_len >= ((int64_t)sizeof(error->message))) { - memcpy(error->message, src, sizeof(error->message) - 1); - error->message[sizeof(error->message) - 1] = '\0'; - } else { - memcpy(error->message, src, src_len); - error->message[src_len] = '\0'; - } -} - -/// \brief Check the result of an expression and return it if not NANOARROW_OK -/// \ingroup nanoarrow-errors -#define NANOARROW_RETURN_NOT_OK(EXPR) \ - _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) - -/// \brief Check the result of an expression and return it if not NANOARROW_OK, -/// adding an auto-generated message to an ArrowError. -/// \ingroup nanoarrow-errors -/// -/// This macro is used to ensure that functions that accept an ArrowError -/// as input always set its message when returning an error code (e.g., when calling -/// a nanoarrow function that does *not* accept ArrowError). -#define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ - _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ - _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, ERROR_EXPR, #EXPR) - -#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) -#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ - do { \ - fprintf(stderr, "%s failed with code %d\n* %s:%d\n", EXPR_STR, (int)(VALUE), \ - __FILE__, (int)__LINE__); \ - abort(); \ - } while (0) -#endif - -#if defined(NANOARROW_DEBUG) -#define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ - } while (0) - -/// \brief Assert that an expression's value is NANOARROW_OK -/// \ingroup nanoarrow-errors -/// -/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), -/// print a message to stderr and abort. If nanoarrow was built in release mode, -/// this statement has no effect. You can customize fatal error behaviour -/// be defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h -/// This macro is provided as a convenience for users and is not used internally. -#define NANOARROW_ASSERT_OK(EXPR) \ - _NANOARROW_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) - -#define _NANOARROW_DCHECK_IMPL(EXPR, EXPR_STR) \ - do { \ - if (!(EXPR)) NANOARROW_PRINT_AND_DIE(-1, EXPR_STR); \ - } while (0) - -#define NANOARROW_DCHECK(EXPR) _NANOARROW_DCHECK_IMPL(EXPR, #EXPR) -#else -#define NANOARROW_ASSERT_OK(EXPR) EXPR -#define NANOARROW_DCHECK(EXPR) -#endif - -static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { - NANOARROW_DCHECK(src != NULL); - NANOARROW_DCHECK(dst != NULL); - - memcpy(dst, src, sizeof(struct ArrowSchema)); - src->release = NULL; -} - -static inline void ArrowSchemaRelease(struct ArrowSchema* schema) { - NANOARROW_DCHECK(schema != NULL); - schema->release(schema); - NANOARROW_DCHECK(schema->release == NULL); -} - -static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { - NANOARROW_DCHECK(src != NULL); - NANOARROW_DCHECK(dst != NULL); - - memcpy(dst, src, sizeof(struct ArrowArray)); - src->release = NULL; -} - -static inline void ArrowArrayRelease(struct ArrowArray* array) { - NANOARROW_DCHECK(array != NULL); - array->release(array); - NANOARROW_DCHECK(array->release == NULL); -} - -static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, - struct ArrowArrayStream* dst) { - NANOARROW_DCHECK(src != NULL); - NANOARROW_DCHECK(dst != NULL); - - memcpy(dst, src, sizeof(struct ArrowArrayStream)); - src->release = NULL; -} - -static inline const char* ArrowArrayStreamGetLastError( - struct ArrowArrayStream* array_stream) { - NANOARROW_DCHECK(array_stream != NULL); - - const char* value = array_stream->get_last_error(array_stream); - if (value == NULL) { - return ""; - } else { - return value; - } -} - -static inline ArrowErrorCode ArrowArrayStreamGetSchema( - struct ArrowArrayStream* array_stream, struct ArrowSchema* out, - struct ArrowError* error) { - NANOARROW_DCHECK(array_stream != NULL); - - int result = array_stream->get_schema(array_stream, out); - if (result != NANOARROW_OK && error != NULL) { - ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); - } - - return result; -} - -static inline ArrowErrorCode ArrowArrayStreamGetNext( - struct ArrowArrayStream* array_stream, struct ArrowArray* out, - struct ArrowError* error) { - NANOARROW_DCHECK(array_stream != NULL); - - int result = array_stream->get_next(array_stream, out); - if (result != NANOARROW_OK && error != NULL) { - ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); - } - - return result; -} - -static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream) { - NANOARROW_DCHECK(array_stream != NULL); - array_stream->release(array_stream); - NANOARROW_DCHECK(array_stream->release == NULL); -} - -static char _ArrowIsLittleEndian(void) { - uint32_t check = 1; - char first_byte; - memcpy(&first_byte, &check, sizeof(char)); - return first_byte; -} - -/// \brief Arrow type enumerator -/// \ingroup nanoarrow-utils -/// -/// These names are intended to map to the corresponding arrow::Type::type -/// enumerator; however, the numeric values are specifically not equal -/// (i.e., do not rely on numeric comparison). -enum ArrowType { - NANOARROW_TYPE_UNINITIALIZED = 0, - NANOARROW_TYPE_NA = 1, - NANOARROW_TYPE_BOOL, - NANOARROW_TYPE_UINT8, - NANOARROW_TYPE_INT8, - NANOARROW_TYPE_UINT16, - NANOARROW_TYPE_INT16, - NANOARROW_TYPE_UINT32, - NANOARROW_TYPE_INT32, - NANOARROW_TYPE_UINT64, - NANOARROW_TYPE_INT64, - NANOARROW_TYPE_HALF_FLOAT, - NANOARROW_TYPE_FLOAT, - NANOARROW_TYPE_DOUBLE, - NANOARROW_TYPE_STRING, - NANOARROW_TYPE_BINARY, - NANOARROW_TYPE_FIXED_SIZE_BINARY, - NANOARROW_TYPE_DATE32, - NANOARROW_TYPE_DATE64, - NANOARROW_TYPE_TIMESTAMP, - NANOARROW_TYPE_TIME32, - NANOARROW_TYPE_TIME64, - NANOARROW_TYPE_INTERVAL_MONTHS, - NANOARROW_TYPE_INTERVAL_DAY_TIME, - NANOARROW_TYPE_DECIMAL128, - NANOARROW_TYPE_DECIMAL256, - NANOARROW_TYPE_LIST, - NANOARROW_TYPE_STRUCT, - NANOARROW_TYPE_SPARSE_UNION, - NANOARROW_TYPE_DENSE_UNION, - NANOARROW_TYPE_DICTIONARY, - NANOARROW_TYPE_MAP, - NANOARROW_TYPE_EXTENSION, - NANOARROW_TYPE_FIXED_SIZE_LIST, - NANOARROW_TYPE_DURATION, - NANOARROW_TYPE_LARGE_STRING, - NANOARROW_TYPE_LARGE_BINARY, - NANOARROW_TYPE_LARGE_LIST, - NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO -}; - -/// \brief Get a string value of an enum ArrowType value -/// \ingroup nanoarrow-utils -/// -/// Returns NULL for invalid values for type -static inline const char* ArrowTypeString(enum ArrowType type); - -static inline const char* ArrowTypeString(enum ArrowType type) { - switch (type) { - case NANOARROW_TYPE_NA: - return "na"; - case NANOARROW_TYPE_BOOL: - return "bool"; - case NANOARROW_TYPE_UINT8: - return "uint8"; - case NANOARROW_TYPE_INT8: - return "int8"; - case NANOARROW_TYPE_UINT16: - return "uint16"; - case NANOARROW_TYPE_INT16: - return "int16"; - case NANOARROW_TYPE_UINT32: - return "uint32"; - case NANOARROW_TYPE_INT32: - return "int32"; - case NANOARROW_TYPE_UINT64: - return "uint64"; - case NANOARROW_TYPE_INT64: - return "int64"; - case NANOARROW_TYPE_HALF_FLOAT: - return "half_float"; - case NANOARROW_TYPE_FLOAT: - return "float"; - case NANOARROW_TYPE_DOUBLE: - return "double"; - case NANOARROW_TYPE_STRING: - return "string"; - case NANOARROW_TYPE_BINARY: - return "binary"; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - return "fixed_size_binary"; - case NANOARROW_TYPE_DATE32: - return "date32"; - case NANOARROW_TYPE_DATE64: - return "date64"; - case NANOARROW_TYPE_TIMESTAMP: - return "timestamp"; - case NANOARROW_TYPE_TIME32: - return "time32"; - case NANOARROW_TYPE_TIME64: - return "time64"; - case NANOARROW_TYPE_INTERVAL_MONTHS: - return "interval_months"; - case NANOARROW_TYPE_INTERVAL_DAY_TIME: - return "interval_day_time"; - case NANOARROW_TYPE_DECIMAL128: - return "decimal128"; - case NANOARROW_TYPE_DECIMAL256: - return "decimal256"; - case NANOARROW_TYPE_LIST: - return "list"; - case NANOARROW_TYPE_STRUCT: - return "struct"; - case NANOARROW_TYPE_SPARSE_UNION: - return "sparse_union"; - case NANOARROW_TYPE_DENSE_UNION: - return "dense_union"; - case NANOARROW_TYPE_DICTIONARY: - return "dictionary"; - case NANOARROW_TYPE_MAP: - return "map"; - case NANOARROW_TYPE_EXTENSION: - return "extension"; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - return "fixed_size_list"; - case NANOARROW_TYPE_DURATION: - return "duration"; - case NANOARROW_TYPE_LARGE_STRING: - return "large_string"; - case NANOARROW_TYPE_LARGE_BINARY: - return "large_binary"; - case NANOARROW_TYPE_LARGE_LIST: - return "large_list"; - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: - return "interval_month_day_nano"; - default: - return NULL; - } -} - -/// \brief Arrow time unit enumerator -/// \ingroup nanoarrow-utils -/// -/// These names and values map to the corresponding arrow::TimeUnit::type -/// enumerator. -enum ArrowTimeUnit { - NANOARROW_TIME_UNIT_SECOND = 0, - NANOARROW_TIME_UNIT_MILLI = 1, - NANOARROW_TIME_UNIT_MICRO = 2, - NANOARROW_TIME_UNIT_NANO = 3 -}; - -/// \brief Validation level enumerator -/// \ingroup nanoarrow-array -enum ArrowValidationLevel { - /// \brief Do not validate buffer sizes or content. - NANOARROW_VALIDATION_LEVEL_NONE = 0, - - /// \brief Validate buffer sizes that depend on array length but do not validate buffer - /// sizes that depend on buffer data access. - NANOARROW_VALIDATION_LEVEL_MINIMAL = 1, - - /// \brief Validate all buffer sizes, including those that require buffer data access, - /// but do not perform any checks that are O(1) along the length of the buffers. - NANOARROW_VALIDATION_LEVEL_DEFAULT = 2, - - /// \brief Validate all buffer sizes and all buffer content. This is useful in the - /// context of untrusted input or input that may have been corrupted in transit. - NANOARROW_VALIDATION_LEVEL_FULL = 3 -}; - -/// \brief Get a string value of an enum ArrowTimeUnit value -/// \ingroup nanoarrow-utils -/// -/// Returns NULL for invalid values for time_unit -static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); - -static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { - switch (time_unit) { - case NANOARROW_TIME_UNIT_SECOND: - return "s"; - case NANOARROW_TIME_UNIT_MILLI: - return "ms"; - case NANOARROW_TIME_UNIT_MICRO: - return "us"; - case NANOARROW_TIME_UNIT_NANO: - return "ns"; - default: - return NULL; - } -} - -/// \brief Functional types of buffers as described in the Arrow Columnar Specification -/// \ingroup nanoarrow-array-view -enum ArrowBufferType { - NANOARROW_BUFFER_TYPE_NONE, - NANOARROW_BUFFER_TYPE_VALIDITY, - NANOARROW_BUFFER_TYPE_TYPE_ID, - NANOARROW_BUFFER_TYPE_UNION_OFFSET, - NANOARROW_BUFFER_TYPE_DATA_OFFSET, - NANOARROW_BUFFER_TYPE_DATA -}; - -/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout -/// \ingroup nanoarrow-array-view -/// -/// All currently supported types have 3 buffers or fewer; however, future types -/// may involve a variable number of buffers (e.g., string view). These buffers -/// will be represented by separate members of the ArrowArrayView or ArrowLayout. -#define NANOARROW_MAX_FIXED_BUFFERS 3 - -/// \brief An non-owning view of a string -/// \ingroup nanoarrow-utils -struct ArrowStringView { - /// \brief A pointer to the start of the string - /// - /// If size_bytes is 0, this value may be NULL. - const char* data; - - /// \brief The size of the string in bytes, - /// - /// (Not including the null terminator.) - int64_t size_bytes; -}; - -/// \brief Return a view of a const C string -/// \ingroup nanoarrow-utils -static inline struct ArrowStringView ArrowCharView(const char* value); - -static inline struct ArrowStringView ArrowCharView(const char* value) { - struct ArrowStringView out; - - out.data = value; - if (value) { - out.size_bytes = (int64_t)strlen(value); - } else { - out.size_bytes = 0; - } - - return out; -} - -union ArrowBufferViewData { - const void* data; - const int8_t* as_int8; - const uint8_t* as_uint8; - const int16_t* as_int16; - const uint16_t* as_uint16; - const int32_t* as_int32; - const uint32_t* as_uint32; - const int64_t* as_int64; - const uint64_t* as_uint64; - const double* as_double; - const float* as_float; - const char* as_char; -}; - -/// \brief An non-owning view of a buffer -/// \ingroup nanoarrow-utils -struct ArrowBufferView { - /// \brief A pointer to the start of the buffer - /// - /// If size_bytes is 0, this value may be NULL. - union ArrowBufferViewData data; - - /// \brief The size of the buffer in bytes - int64_t size_bytes; -}; - -/// \brief Array buffer allocation and deallocation -/// \ingroup nanoarrow-buffer -/// -/// Container for allocate, reallocate, and free methods that can be used -/// to customize allocation and deallocation of buffers when constructing -/// an ArrowArray. -struct ArrowBufferAllocator { - /// \brief Reallocate a buffer or return NULL if it cannot be reallocated - uint8_t* (*reallocate)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, - int64_t old_size, int64_t new_size); - - /// \brief Deallocate a buffer allocated by this allocator - void (*free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); - - /// \brief Opaque data specific to the allocator - void* private_data; -}; - -/// \brief An owning mutable view of a buffer -/// \ingroup nanoarrow-buffer -struct ArrowBuffer { - /// \brief A pointer to the start of the buffer - /// - /// If capacity_bytes is 0, this value may be NULL. - uint8_t* data; - - /// \brief The size of the buffer in bytes - int64_t size_bytes; - - /// \brief The capacity of the buffer in bytes - int64_t capacity_bytes; - - /// \brief The allocator that will be used to reallocate and/or free the buffer - struct ArrowBufferAllocator allocator; -}; - -/// \brief An owning mutable view of a bitmap -/// \ingroup nanoarrow-bitmap -struct ArrowBitmap { - /// \brief An ArrowBuffer to hold the allocated memory - struct ArrowBuffer buffer; - - /// \brief The number of bits that have been appended to the bitmap - int64_t size_bits; -}; - -/// \brief A description of an arrangement of buffers -/// \ingroup nanoarrow-utils -/// -/// Contains the minimum amount of information required to -/// calculate the size of each buffer in an ArrowArray knowing only -/// the length and offset of the array. -struct ArrowLayout { - /// \brief The function of each buffer - enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS]; - - /// \brief The data type of each buffer - enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS]; - - /// \brief The size of an element each buffer or 0 if this size is variable or unknown - int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS]; - - /// \brief The number of elements in the child array per element in this array for a - /// fixed-size list - int64_t child_size_elements; -}; - -/// \brief A non-owning view of an ArrowArray -/// \ingroup nanoarrow-array-view -/// -/// This data structure provides access to the values contained within -/// an ArrowArray with fields provided in a more readily-extractible -/// form. You can re-use an ArrowArrayView for multiple ArrowArrays -/// with the same storage type, use it to represent a hypothetical -/// ArrowArray that does not exist yet, or use it to validate the buffers -/// of a future ArrowArray. -struct ArrowArrayView { - /// \brief The underlying ArrowArray or NULL if it has not been set or - /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. - const struct ArrowArray* array; - - /// \brief The number of elements from the physical start of the buffers. - int64_t offset; - - /// \brief The number of elements in this view. - int64_t length; - - /// \brief A cached null count or -1 to indicate that this value is unknown. - int64_t null_count; - - /// \brief The type used to store values in this array - /// - /// This type represents only the minimum required information to - /// extract values from the array buffers (e.g., for a Date32 array, - /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded - /// arrays, this will be the index type. - enum ArrowType storage_type; - - /// \brief The buffer types, strides, and sizes of this Array's buffers - struct ArrowLayout layout; - - /// \brief This Array's buffers as ArrowBufferView objects - struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS]; - - /// \brief The number of children of this view - int64_t n_children; - - /// \brief Pointers to views of this array's children - struct ArrowArrayView** children; - - /// \brief Pointer to a view of this array's dictionary - struct ArrowArrayView* dictionary; - - /// \brief Union type id to child index mapping - /// - /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer - /// such that child_index == union_type_id_map[type_id] and - /// type_id == union_type_id_map[128 + child_index]. This value may be - /// NULL in the case where child_id == type_id. - int8_t* union_type_id_map; -}; - -// Used as the private data member for ArrowArrays allocated here and accessed -// internally within inline ArrowArray* helpers. -struct ArrowArrayPrivateData { - // Holder for the validity buffer (or first buffer for union types, which are - // the only type whose first buffer is not a valdiity buffer) - struct ArrowBitmap bitmap; - - // Holder for additional buffers as required - struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1]; - - // The array of pointers to buffers. This must be updated after a sequence - // of appends to synchronize its values with the actual buffer addresses - // (which may have ben reallocated uring that time) - const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; - - // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown - enum ArrowType storage_type; - - // The buffer arrangement for the storage type - struct ArrowLayout layout; - - // Flag to indicate if there are non-sequence union type ids. - // In the future this could be replaced with a type id<->child mapping - // to support constructing unions in append mode where type_id != child_index - int8_t union_type_id_is_child_index; -}; - -/// \brief A representation of an interval. -/// \ingroup nanoarrow-utils -struct ArrowInterval { - /// \brief The type of interval being used - enum ArrowType type; - /// \brief The number of months represented by the interval - int32_t months; - /// \brief The number of days represented by the interval - int32_t days; - /// \brief The number of ms represented by the interval - int32_t ms; - /// \brief The number of ns represented by the interval - int64_t ns; -}; - -/// \brief Zero initialize an Interval with a given unit -/// \ingroup nanoarrow-utils -static inline void ArrowIntervalInit(struct ArrowInterval* interval, - enum ArrowType type) { - memset(interval, 0, sizeof(struct ArrowInterval)); - interval->type = type; -} - -/// \brief A representation of a fixed-precision decimal number -/// \ingroup nanoarrow-utils -/// -/// This structure should be initialized with ArrowDecimalInit() once and -/// values set using ArrowDecimalSetInt(), ArrowDecimalSetBytes128(), -/// or ArrowDecimalSetBytes256(). -struct ArrowDecimal { - /// \brief An array of 64-bit integers of n_words length defined in native-endian order - uint64_t words[4]; - - /// \brief The number of significant digits this decimal number can represent - int32_t precision; - - /// \brief The number of digits after the decimal point. This can be negative. - int32_t scale; - - /// \brief The number of words in the words array - int n_words; - - /// \brief Cached value used by the implementation - int high_word_index; - - /// \brief Cached value used by the implementation - int low_word_index; -}; - -/// \brief Initialize a decimal with a given set of type parameters -/// \ingroup nanoarrow-utils -static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwidth, - int32_t precision, int32_t scale) { - memset(decimal->words, 0, sizeof(decimal->words)); - decimal->precision = precision; - decimal->scale = scale; - decimal->n_words = bitwidth / 8 / sizeof(uint64_t); - - if (_ArrowIsLittleEndian()) { - decimal->low_word_index = 0; - decimal->high_word_index = decimal->n_words - 1; - } else { - decimal->low_word_index = decimal->n_words - 1; - decimal->high_word_index = 0; - } -} - -/// \brief Get a signed integer value of a sufficiently small ArrowDecimal -/// -/// This does not check if the decimal's precision sufficiently small to fit -/// within the signed 64-bit integer range (A precision less than or equal -/// to 18 is sufficiently small). -static inline int64_t ArrowDecimalGetIntUnsafe(const struct ArrowDecimal* decimal) { - return (int64_t)decimal->words[decimal->low_word_index]; -} - -/// \brief Copy the bytes of this decimal into a sufficiently large buffer -/// \ingroup nanoarrow-utils -static inline void ArrowDecimalGetBytes(const struct ArrowDecimal* decimal, - uint8_t* out) { - memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); -} - -/// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise -/// \ingroup nanoarrow-utils -static inline int64_t ArrowDecimalSign(const struct ArrowDecimal* decimal) { - return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); -} - -/// \brief Sets the integer value of this decimal -/// \ingroup nanoarrow-utils -static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t value) { - if (value < 0) { - memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); - } else { - memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); - } - - decimal->words[decimal->low_word_index] = value; -} - -/// \brief Negate the value of this decimal in place -/// \ingroup nanoarrow-utils -static inline void ArrowDecimalNegate(struct ArrowDecimal* decimal) { - uint64_t carry = 1; - - if (decimal->low_word_index == 0) { - for (int i = 0; i < decimal->n_words; i++) { - uint64_t elem = decimal->words[i]; - elem = ~elem + carry; - carry &= (elem == 0); - decimal->words[i] = elem; - } - } else { - for (int i = decimal->low_word_index; i >= 0; i--) { - uint64_t elem = decimal->words[i]; - elem = ~elem + carry; - carry &= (elem == 0); - decimal->words[i] = elem; - } - } -} - -/// \brief Copy bytes from a buffer into this decimal -/// \ingroup nanoarrow-utils -static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, - const uint8_t* value) { - memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); -} - -#ifdef __cplusplus -} -#endif - -#endif -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_H_INCLUDED -#define NANOARROW_H_INCLUDED - -#include -#include -#include - - - -// If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will set this -// define in nanoarrow_config.h. If not, you can optionally #define NANOARROW_NAMESPACE -// MyNamespace here. - -// This section remaps the non-prefixed symbols to the prefixed symbols so that -// code written against this build can be used independent of the value of -// NANOARROW_NAMESPACE. -#ifdef NANOARROW_NAMESPACE -#define NANOARROW_CAT(A, B) A##B -#define NANOARROW_SYMBOL(A, B) NANOARROW_CAT(A, B) - -#define ArrowNanoarrowVersion NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) -#define ArrowNanoarrowVersionInt \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) -#define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc) -#define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc) -#define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree) -#define ArrowBufferAllocatorDefault \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault) -#define ArrowBufferDeallocator \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) -#define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet) -#define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit) -#define ArrowDecimalSetDigits NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalSetDigits) -#define ArrowDecimalAppendDigitsToBuffer \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalAppendDigitsToBuffer) -#define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit) -#define ArrowSchemaInitFromType \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) -#define ArrowSchemaSetType NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType) -#define ArrowSchemaSetTypeStruct \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct) -#define ArrowSchemaSetTypeFixedSize \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) -#define ArrowSchemaSetTypeDecimal \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) -#define ArrowSchemaSetTypeDateTime \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) -#define ArrowSchemaSetTypeUnion \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion) -#define ArrowSchemaDeepCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy) -#define ArrowSchemaSetFormat NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat) -#define ArrowSchemaSetName NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName) -#define ArrowSchemaSetMetadata \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata) -#define ArrowSchemaAllocateChildren \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren) -#define ArrowSchemaAllocateDictionary \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary) -#define ArrowMetadataReaderInit \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit) -#define ArrowMetadataReaderRead \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead) -#define ArrowMetadataSizeOf NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf) -#define ArrowMetadataHasKey NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey) -#define ArrowMetadataGetValue NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue) -#define ArrowMetadataBuilderInit \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit) -#define ArrowMetadataBuilderAppend \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend) -#define ArrowMetadataBuilderSet \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet) -#define ArrowMetadataBuilderRemove \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove) -#define ArrowSchemaViewInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit) -#define ArrowSchemaToString NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString) -#define ArrowArrayInitFromType \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) -#define ArrowArrayInitFromSchema \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) -#define ArrowArrayInitFromArrayView \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) -#define ArrowArrayInitFromArrayView \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) -#define ArrowArrayAllocateChildren \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) -#define ArrowArrayAllocateDictionary \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary) -#define ArrowArraySetValidityBitmap \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap) -#define ArrowArraySetBuffer NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer) -#define ArrowArrayReserve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve) -#define ArrowArrayFinishBuilding \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) -#define ArrowArrayFinishBuildingDefault \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) -#define ArrowArrayViewInitFromType \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) -#define ArrowArrayViewInitFromSchema \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) -#define ArrowArrayViewAllocateChildren \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) -#define ArrowArrayViewAllocateDictionary \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary) -#define ArrowArrayViewSetLength \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength) -#define ArrowArrayViewSetArray \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) -#define ArrowArrayViewSetArrayMinimal \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) -#define ArrowArrayViewValidate \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) -#define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) -#define ArrowBasicArrayStreamInit \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) -#define ArrowBasicArrayStreamSetArray \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray) -#define ArrowBasicArrayStreamValidate \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate) - -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/// \defgroup nanoarrow Nanoarrow C library -/// -/// Except where noted, objects are not thread-safe and clients should -/// take care to serialize accesses to methods. -/// -/// Because this library is intended to be vendored, it provides full type -/// definitions and encourages clients to stack or statically allocate -/// where convenient. - -/// \defgroup nanoarrow-malloc Memory management -/// -/// Non-buffer members of a struct ArrowSchema and struct ArrowArray -/// must be allocated using ArrowMalloc() or ArrowRealloc() and freed -/// using ArrowFree() for schemas and arrays allocated here. Buffer members -/// are allocated using an ArrowBufferAllocator. -/// -/// @{ - -/// \brief Allocate like malloc() -void* ArrowMalloc(int64_t size); - -/// \brief Reallocate like realloc() -void* ArrowRealloc(void* ptr, int64_t size); - -/// \brief Free a pointer allocated using ArrowMalloc() or ArrowRealloc(). -void ArrowFree(void* ptr); - -/// \brief Return the default allocator -/// -/// The default allocator uses ArrowMalloc(), ArrowRealloc(), and -/// ArrowFree(). -struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void); - -/// \brief Create a custom deallocator -/// -/// Creates a buffer allocator with only a free method that can be used to -/// attach a custom deallocator to an ArrowBuffer. This may be used to -/// avoid copying an existing buffer that was not allocated using the -/// infrastructure provided here (e.g., by an R or Python object). -struct ArrowBufferAllocator ArrowBufferDeallocator( - void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, - int64_t size), - void* private_data); - -/// @} - -/// \brief Move the contents of an src ArrowSchema into dst and set src->release to NULL -/// \ingroup nanoarrow-arrow-cdata -static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst); - -/// \brief Call the release callback of an ArrowSchema -/// \ingroup nanoarrow-arrow-cdata -static inline void ArrowSchemaRelease(struct ArrowSchema* schema); - -/// \brief Move the contents of an src ArrowArray into dst and set src->release to NULL -/// \ingroup nanoarrow-arrow-cdata -static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst); - -/// \brief Call the release callback of an ArrowArray -static inline void ArrowArrayRelease(struct ArrowArray* array); - -/// \brief Move the contents of an src ArrowArrayStream into dst and set src->release to -/// NULL \ingroup nanoarrow-arrow-cdata -static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, - struct ArrowArrayStream* dst); - -/// \brief Call the get_schema callback of an ArrowArrayStream -/// \ingroup nanoarrow-arrow-cdata -/// -/// Unlike the get_schema callback, this wrapper checks the return code -/// and propagates the error reported by get_last_error into error. This -/// makes it significantly less verbose to iterate over array streams -/// using NANOARROW_RETURN_NOT_OK()-style error handling. -static inline ArrowErrorCode ArrowArrayStreamGetSchema( - struct ArrowArrayStream* array_stream, struct ArrowSchema* out, - struct ArrowError* error); - -/// \brief Call the get_schema callback of an ArrowArrayStream -/// \ingroup nanoarrow-arrow-cdata -/// -/// Unlike the get_next callback, this wrapper checks the return code -/// and propagates the error reported by get_last_error into error. This -/// makes it significantly less verbose to iterate over array streams -/// using NANOARROW_RETURN_NOT_OK()-style error handling. -static inline ArrowErrorCode ArrowArrayStreamGetNext( - struct ArrowArrayStream* array_stream, struct ArrowArray* out, - struct ArrowError* error); - -/// \brief Call the get_next callback of an ArrowArrayStream -/// \ingroup nanoarrow-arrow-cdata -/// -/// Unlike the get_next callback, this function never returns NULL (i.e., its -/// result is safe to use in printf-style error formatters). Null values from the -/// original callback are reported as "". -static inline const char* ArrowArrayStreamGetLastError( - struct ArrowArrayStream* array_stream); - -/// \brief Call the release callback of an ArrowArrayStream -static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream); - -/// \defgroup nanoarrow-errors Error handling -/// -/// Functions generally return an errno-compatible error code; functions that -/// need to communicate more verbose error information accept a pointer -/// to an ArrowError. This can be stack or statically allocated. The -/// content of the message is undefined unless an error code has been -/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, the -/// ArrowError pointed to by the argument will be propagated with a -/// null-terminated error message. It is safe to pass a NULL ArrowError anywhere -/// in the nanoarrow API. -/// -/// Except where documented, it is generally not safe to continue after a -/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK and -/// NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ clients can use -/// the helpers provided in the nanoarrow.hpp header to facilitate using C++ idioms -/// for memory management and error propgagtion. -/// -/// @{ - -/// \brief Set the contents of an error using printf syntax. -/// -/// If error is NULL, this function does nothing and returns NANOARROW_OK. -NANOARROW_CHECK_PRINTF_ATTRIBUTE int ArrowErrorSet(struct ArrowError* error, - const char* fmt, ...); - -/// @} - -/// \defgroup nanoarrow-utils Utility data structures -/// -/// @{ - -/// \brief Return a version string in the form "major.minor.patch" -const char* ArrowNanoarrowVersion(void); - -/// \brief Return an integer that can be used to compare versions sequentially -int ArrowNanoarrowVersionInt(void); - -/// \brief Initialize a description of buffer arrangements from a storage type -void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type); - -/// \brief Create a string view from a null-terminated string -static inline struct ArrowStringView ArrowCharView(const char* value); - -/// \brief Sets the integer value of an ArrowDecimal from a string -ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, - struct ArrowStringView value); - -/// \brief Get the integer value of an ArrowDecimal as string -ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, - struct ArrowBuffer* buffer); - -/// @} - -/// \defgroup nanoarrow-schema Creating schemas -/// -/// These functions allocate, copy, and destroy ArrowSchema structures -/// -/// @{ - -/// \brief Initialize an ArrowSchema -/// -/// Initializes the fields and release callback of schema_out. Caller -/// is responsible for calling the schema->release callback if -/// NANOARROW_OK is returned. -void ArrowSchemaInit(struct ArrowSchema* schema); - -/// \brief Initialize an ArrowSchema from an ArrowType -/// -/// A convenience constructor for that calls ArrowSchemaInit() and -/// ArrowSchemaSetType() for the common case of constructing an -/// unparameterized type. The caller is responsible for calling the schema->release -/// callback if NANOARROW_OK is returned. -ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type); - -/// \brief Get a human-readable summary of a Schema -/// -/// Writes a summary of an ArrowSchema to out (up to n - 1 characters) -/// and returns the number of characters required for the output if -/// n were sufficiently large. If recursive is non-zero, the result will -/// also include children. -int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, - char recursive); - -/// \brief Set the format field of a schema from an ArrowType -/// -/// Initializes the fields and release callback of schema_out. For -/// NANOARROW_TYPE_LIST, NANOARROW_TYPE_LARGE_LIST, and -/// NANOARROW_TYPE_MAP, the appropriate number of children are -/// allocated, initialized, and named; however, the caller must -/// ArrowSchemaSetType() on the preinitialized children. Schema must have been initialized -/// using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type); - -/// \brief Set the format field and initialize children of a struct schema -/// -/// The specified number of children are initialized; however, the caller is responsible -/// for calling ArrowSchemaSetType() and ArrowSchemaSetName() on each child. -/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children); - -/// \brief Set the format field of a fixed-size schema -/// -/// Returns EINVAL for fixed_size <= 0 or for type that is not -/// NANOARROW_TYPE_FIXED_SIZE_BINARY or NANOARROW_TYPE_FIXED_SIZE_LIST. -/// For NANOARROW_TYPE_FIXED_SIZE_LIST, the appropriate number of children are -/// allocated, initialized, and named; however, the caller must -/// ArrowSchemaSetType() the first child. Schema must have been initialized using -/// ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, - enum ArrowType type, int32_t fixed_size); - -/// \brief Set the format field of a decimal schema -/// -/// Returns EINVAL for scale <= 0 or for type that is not -/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have been -/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, - int32_t decimal_precision, - int32_t decimal_scale); - -/// \brief Set the format field of a time, timestamp, or duration schema -/// -/// Returns EINVAL for type that is not -/// NANOARROW_TYPE_TIME32, NANOARROW_TYPE_TIME64, -/// NANOARROW_TYPE_TIMESTAMP, or NANOARROW_TYPE_DURATION. The -/// timezone parameter must be NULL for a non-timestamp type. Schema must have been -/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, - enum ArrowTimeUnit time_unit, - const char* timezone); - -/// \brief Seet the format field of a union schema -/// -/// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION -/// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are -/// allocated, and initialized. -ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, - int64_t n_children); - -/// \brief Make a (recursive) copy of a schema -/// -/// Allocates and copies fields of schema into schema_out. -ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, - struct ArrowSchema* schema_out); - -/// \brief Copy format into schema->format -/// -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format); - -/// \brief Copy name into schema->name -/// -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name); - -/// \brief Copy metadata into schema->metadata -/// -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy. -ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata); - -/// \brief Allocate the schema->children array -/// -/// Includes the memory for each child struct ArrowSchema. -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, - int64_t n_children); - -/// \brief Allocate the schema->dictionary member -/// -/// schema must have been allocated using ArrowSchemaInitFromType() or -/// ArrowSchemaDeepCopy(). -ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema); - -/// @} - -/// \defgroup nanoarrow-metadata Create, read, and modify schema metadata -/// -/// @{ - -/// \brief Reader for key/value pairs in schema metadata -/// -/// The ArrowMetadataReader does not own any data and is only valid -/// for the lifetime of the underlying metadata pointer. -struct ArrowMetadataReader { - /// \brief A metadata string from a schema->metadata field. - const char* metadata; - - /// \brief The current offset into the metadata string - int64_t offset; - - /// \brief The number of remaining keys - int32_t remaining_keys; -}; - -/// \brief Initialize an ArrowMetadataReader -ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, - const char* metadata); - -/// \brief Read the next key/value pair from an ArrowMetadataReader -ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, - struct ArrowStringView* key_out, - struct ArrowStringView* value_out); - -/// \brief The number of bytes in in a key/value metadata string -int64_t ArrowMetadataSizeOf(const char* metadata); - -/// \brief Check for a key in schema metadata -char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key); - -/// \brief Extract a value from schema metadata -/// -/// If key does not exist in metadata, value_out is unmodified -ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, - struct ArrowStringView* value_out); - -/// \brief Initialize a builder for schema metadata from key/value pairs -/// -/// metadata can be an existing metadata string or NULL to initialize -/// an empty metadata string. -ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata); - -/// \brief Append a key/value pair to a buffer containing serialized metadata -ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, - struct ArrowStringView key, - struct ArrowStringView value); - -/// \brief Set a key/value pair to a buffer containing serialized metadata -/// -/// Ensures that the only entry for key in the metadata is set to value. -/// This function maintains the existing position of (the first instance of) -/// key if present in the data. -ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, - struct ArrowStringView key, - struct ArrowStringView value); - -/// \brief Remove a key from a buffer containing serialized metadata -ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, - struct ArrowStringView key); - -/// @} - -/// \defgroup nanoarrow-schema-view Reading schemas -/// -/// @{ - -/// \brief A non-owning view of a parsed ArrowSchema -/// -/// Contains more readily extractable values than a raw ArrowSchema. -/// Clients can stack or statically allocate this structure but are -/// encouraged to use the provided getters to ensure forward -/// compatibility. -struct ArrowSchemaView { - /// \brief A pointer to the schema represented by this view - const struct ArrowSchema* schema; - - /// \brief The data type represented by the schema - /// - /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a - /// non-null dictionary member; datetime types are valid values. - /// This value will never be NANOARROW_TYPE_EXTENSION (see - /// extension_name and/or extension_metadata to check for - /// an extension type). - enum ArrowType type; - - /// \brief The storage data type represented by the schema - /// - /// This value will never be NANOARROW_TYPE_DICTIONARY, NANOARROW_TYPE_EXTENSION - /// or any datetime type. This value represents only the type required to - /// interpret the buffers in the array. - enum ArrowType storage_type; - - /// \brief The storage layout represented by the schema - struct ArrowLayout layout; - - /// \brief The extension type name if it exists - /// - /// If the ARROW:extension:name key is present in schema.metadata, - /// extension_name.data will be non-NULL. - struct ArrowStringView extension_name; - - /// \brief The extension type metadata if it exists - /// - /// If the ARROW:extension:metadata key is present in schema.metadata, - /// extension_metadata.data will be non-NULL. - struct ArrowStringView extension_metadata; - - /// \brief Format fixed size parameter - /// - /// This value is set when parsing a fixed-size binary or fixed-size - /// list schema; this value is undefined for other types. For a - /// fixed-size binary schema this value is in bytes; for a fixed-size - /// list schema this value refers to the number of child elements for - /// each element of the parent. - int32_t fixed_size; - - /// \brief Decimal bitwidth - /// - /// This value is set when parsing a decimal type schema; - /// this value is undefined for other types. - int32_t decimal_bitwidth; - - /// \brief Decimal precision - /// - /// This value is set when parsing a decimal type schema; - /// this value is undefined for other types. - int32_t decimal_precision; - - /// \brief Decimal scale - /// - /// This value is set when parsing a decimal type schema; - /// this value is undefined for other types. - int32_t decimal_scale; - - /// \brief Format time unit parameter - /// - /// This value is set when parsing a date/time type. The value is - /// undefined for other types. - enum ArrowTimeUnit time_unit; - - /// \brief Format timezone parameter - /// - /// This value is set when parsing a timestamp type and represents - /// the timezone format parameter. This value points to - /// data within the schema and is undefined for other types. - const char* timezone; - - /// \brief Union type ids parameter - /// - /// This value is set when parsing a union type and represents - /// type ids parameter. This value points to - /// data within the schema and is undefined for other types. - const char* union_type_ids; -}; - -/// \brief Initialize an ArrowSchemaView -ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, - const struct ArrowSchema* schema, - struct ArrowError* error); - -/// @} - -/// \defgroup nanoarrow-buffer Owning, growable buffers -/// -/// @{ - -/// \brief Initialize an ArrowBuffer -/// -/// Initialize a buffer with a NULL, zero-size buffer using the default -/// buffer allocator. -static inline void ArrowBufferInit(struct ArrowBuffer* buffer); - -/// \brief Set a newly-initialized buffer's allocator -/// -/// Returns EINVAL if the buffer has already been allocated. -static inline ArrowErrorCode ArrowBufferSetAllocator( - struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator); - -/// \brief Reset an ArrowBuffer -/// -/// Releases the buffer using the allocator's free method if -/// the buffer's data member is non-null, sets the data member -/// to NULL, and sets the buffer's size and capacity to 0. -static inline void ArrowBufferReset(struct ArrowBuffer* buffer); - -/// \brief Move an ArrowBuffer -/// -/// Transfers the buffer data and lifecycle management to another -/// address and resets buffer. -static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst); - -/// \brief Grow or shrink a buffer to a given capacity -/// -/// When shrinking the capacity of the buffer, the buffer is only reallocated -/// if shrink_to_fit is non-zero. Calling ArrowBufferResize() does not -/// adjust the buffer's size member except to ensure that the invariant -/// capacity >= size remains true. -static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, - int64_t new_capacity_bytes, - char shrink_to_fit); - -/// \brief Ensure a buffer has at least a given additional capacity -/// -/// Ensures that the buffer has space to append at least -/// additional_size_bytes, overallocating when required. -static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, - int64_t additional_size_bytes); - -/// \brief Write data to buffer and increment the buffer size -/// -/// This function does not check that buffer has the required capacity -static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, - int64_t size_bytes); - -/// \brief Write data to buffer and increment the buffer size -/// -/// This function writes and ensures that the buffer has the required capacity, -/// possibly by reallocating the buffer. Like ArrowBufferReserve, this will -/// overallocate when reallocation is required. -static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, - const void* data, int64_t size_bytes); - -/// \brief Write fill to buffer and increment the buffer size -/// -/// This function writes the specified number of fill bytes and -/// ensures that the buffer has the required capacity, -static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, - uint8_t value, int64_t size_bytes); - -/// \brief Write an 8-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, - int8_t value); - -/// \brief Write an unsigned 8-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, - uint8_t value); - -/// \brief Write a 16-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, - int16_t value); - -/// \brief Write an unsigned 16-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, - uint16_t value); - -/// \brief Write a 32-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, - int32_t value); - -/// \brief Write an unsigned 32-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, - uint32_t value); - -/// \brief Write a 64-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, - int64_t value); - -/// \brief Write an unsigned 64-bit integer to a buffer -static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, - uint64_t value); - -/// \brief Write a double to a buffer -static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, - double value); - -/// \brief Write a float to a buffer -static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, - float value); - -/// \brief Write an ArrowStringView to a buffer -static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, - struct ArrowStringView value); - -/// \brief Write an ArrowBufferView to a buffer -static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, - struct ArrowBufferView value); - -/// @} - -/// \defgroup nanoarrow-bitmap Bitmap utilities -/// -/// @{ - -/// \brief Extract a boolean value from a bitmap -static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i); - -/// \brief Set a boolean value to a bitmap to true -static inline void ArrowBitSet(uint8_t* bits, int64_t i); - -/// \brief Set a boolean value to a bitmap to false -static inline void ArrowBitClear(uint8_t* bits, int64_t i); - -/// \brief Set a boolean value to a bitmap -static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t value); - -/// \brief Set a boolean value to a range in a bitmap -static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, - uint8_t bits_are_set); - -/// \brief Count true values in a bitmap -static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); - -/// \brief Extract int8 boolean values from a range in a bitmap -static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, - int64_t length, int8_t* out); - -/// \brief Extract int32 boolean values from a range in a bitmap -static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, - int64_t length, int32_t* out); - -/// \brief Initialize an ArrowBitmap -/// -/// Initialize the builder's buffer, empty its cache, and reset the size to zero -static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap); - -/// \brief Move an ArrowBitmap -/// -/// Transfers the underlying buffer data and lifecycle management to another -/// address and resets the bitmap. -static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst); - -/// \brief Ensure a bitmap builder has at least a given additional capacity -/// -/// Ensures that the buffer has space to append at least -/// additional_size_bits, overallocating when required. -static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, - int64_t additional_size_bits); - -/// \brief Grow or shrink a bitmap to a given capacity -/// -/// When shrinking the capacity of the bitmap, the bitmap is only reallocated -/// if shrink_to_fit is non-zero. Calling ArrowBitmapResize() does not -/// adjust the buffer's size member except when shrinking new_capacity_bits -/// to a value less than the current number of bits in the bitmap. -static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, - int64_t new_capacity_bits, - char shrink_to_fit); - -/// \brief Reserve space for and append zero or more of the same boolean value to a bitmap -static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length); - -/// \brief Append zero or more of the same boolean value to a bitmap -static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length); - -/// \brief Append boolean values encoded as int8_t to a bitmap -/// -/// The values must all be 0 or 1. -static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - const int8_t* values, int64_t n_values); - -/// \brief Append boolean values encoded as int32_t to a bitmap -/// -/// The values must all be 0 or 1. -static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, - const int32_t* values, int64_t n_values); - -/// \brief Reset a bitmap builder -/// -/// Releases any memory held by buffer, empties the cache, and resets the size to zero -static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap); - -/// @} - -/// \defgroup nanoarrow-array Creating arrays -/// -/// These functions allocate, copy, and destroy ArrowArray structures. -/// Once an ArrowArray has been initialized via ArrowArrayInitFromType() -/// or ArrowArrayInitFromSchema(), the caller is responsible for releasing -/// it using the embedded release callback. -/// -/// @{ - -/// \brief Initialize the fields of an array -/// -/// Initializes the fields and release callback of array. Caller -/// is responsible for calling the array->release callback if -/// NANOARROW_OK is returned. -ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, - enum ArrowType storage_type); - -/// \brief Initialize the contents of an ArrowArray from an ArrowSchema -/// -/// Caller is responsible for calling the array->release callback if -/// NANOARROW_OK is returned. -ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, - const struct ArrowSchema* schema, - struct ArrowError* error); - -/// \brief Initialize the contents of an ArrowArray from an ArrowArrayView -/// -/// Caller is responsible for calling the array->release callback if -/// NANOARROW_OK is returned. -ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, - const struct ArrowArrayView* array_view, - struct ArrowError* error); - -/// \brief Allocate the array->children array -/// -/// Includes the memory for each child struct ArrowArray, -/// whose members are marked as released and may be subsequently initialized -/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. -/// schema must have been allocated using ArrowArrayInitFromType(). -ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children); - -/// \brief Allocate the array->dictionary member -/// -/// Includes the memory for the struct ArrowArray, whose contents -/// is marked as released and may be subsequently initialized -/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. -/// array must have been allocated using ArrowArrayInitFromType() -ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array); - -/// \brief Set the validity bitmap of an ArrowArray -/// -/// array must have been allocated using ArrowArrayInitFromType() -void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap); - -/// \brief Set a buffer of an ArrowArray -/// -/// array must have been allocated using ArrowArrayInitFromType() -ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, - struct ArrowBuffer* buffer); - -/// \brief Get the validity bitmap of an ArrowArray -/// -/// array must have been allocated using ArrowArrayInitFromType() -static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array); - -/// \brief Get a buffer of an ArrowArray -/// -/// array must have been allocated using ArrowArrayInitFromType() -static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i); - -/// \brief Start element-wise appending to an ArrowArray -/// -/// Initializes any values needed to use ArrowArrayAppend*() functions. -/// All element-wise appenders append by value and return EINVAL if the exact value -/// cannot be represented by the underlying storage type. -/// array must have been allocated using ArrowArrayInitFromType() -static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array); - -/// \brief Reserve space for future appends -/// -/// For buffer sizes that can be calculated (i.e., not string data buffers or -/// child array sizes for non-fixed-size arrays), recursively reserve space for -/// additional elements. This is useful for reducing the number of reallocations -/// that occur using the item-wise appenders. -ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, - int64_t additional_size_elements); - -/// \brief Append a null value to an array -static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n); - -/// \brief Append an empty, non-null value to an array -static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n); - -/// \brief Append a signed integer value to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., value -/// is outside the valid array range). -static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, int64_t value); - -/// \brief Append an unsigned integer value to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., value -/// is outside the valid array range). -static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, - uint64_t value); - -/// \brief Append a double value to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise (e.g., value -/// is outside the valid array range or there is an attempt to append -/// a non-integer to an array with an integer storage type). -static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, - double value); - -/// \brief Append a string of bytes to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type, EOVERFLOW if appending value would overflow -/// the offset type (e.g., if the data buffer would be larger than 2 GB for a -/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a -/// binary, string, large binary, large string, or fixed-size binary array, or value is -/// the wrong size for a fixed-size binary array). -static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, - struct ArrowBufferView value); - -/// \brief Append a string value to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type, EOVERFLOW if appending value would overflow -/// the offset type (e.g., if the data buffer would be larger than 2 GB for a -/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a -/// string or large string array). -static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, - struct ArrowStringView value); - -/// \brief Append a Interval to an array -/// -/// Returns NANOARROW_OK if value can be exactly represented by -/// the underlying storage type or EINVAL otherwise. -static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, - const struct ArrowInterval* value); - -/// \brief Append a decimal value to an array -/// -/// Returns NANOARROW_OK if array is a decimal array with the appropriate -/// bitwidth or EINVAL otherwise. -static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, - const struct ArrowDecimal* value); - -/// \brief Finish a nested array element -/// -/// Appends a non-null element to the array based on the first child's current -/// length. Returns NANOARROW_OK if the item was successfully added, EOVERFLOW -/// if the child of a list or map array would exceed INT_MAX elements, or EINVAL -/// if the underlying storage type is not a struct, list, large list, or fixed-size -/// list, or if there was an attempt to add a struct or fixed-size list element where the -/// length of the child array(s) did not match the expected length. -static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array); - -/// \brief Finish a union array element -/// -/// Appends an element to the union type ids buffer and increments array->length. -/// For sparse unions, up to one element is added to non type-id children. Returns -/// EINVAL if the underlying storage type is not a union, if type_id is not valid, -/// or if child sizes after appending are inconsistent. -static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, - int8_t type_id); - -/// \brief Shrink buffer capacity to the size required -/// -/// Also applies shrinking to any child arrays. array must have been allocated using -/// ArrowArrayInitFromType -static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array); - -/// \brief Finish building an ArrowArray -/// -/// Flushes any pointers from internal buffers that may have been reallocated -/// into array->buffers and checks the actual size of the buffers -/// against the expected size based on the final length. -/// array must have been allocated using ArrowArrayInitFromType() -ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, - struct ArrowError* error); - -/// \brief Finish building an ArrowArray with explicit validation -/// -/// Finish building with an explicit validation level. This could perform less validation -/// (i.e. NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU -/// buffer data access is not possible or more validation (i.e., -/// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or -/// corruptible source. -ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, - enum ArrowValidationLevel validation_level, - struct ArrowError* error); - -/// @} - -/// \defgroup nanoarrow-array-view Reading arrays -/// -/// These functions read and validate the contents ArrowArray structures. -/// -/// @{ - -/// \brief Initialize the contents of an ArrowArrayView -void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, - enum ArrowType storage_type); - -/// \brief Move an ArrowArrayView -/// -/// Transfers the ArrowArrayView data and lifecycle management to another -/// address and resets the contents of src. -static inline void ArrowArrayViewMove(struct ArrowArrayView* src, - struct ArrowArrayView* dst); - -/// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema -ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, - const struct ArrowSchema* schema, - struct ArrowError* error); - -/// \brief Allocate the array_view->children array -/// -/// Includes the memory for each child struct ArrowArrayView -ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, - int64_t n_children); - -/// \brief Allocate array_view->dictionary -ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view); - -/// \brief Set data-independent buffer sizes from length -void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); - -/// \brief Set buffer sizes and data pointers from an ArrowArray -ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, - const struct ArrowArray* array, - struct ArrowError* error); - -/// \brief Set buffer sizes and data pointers from an ArrowArray except for those -/// that require dereferencing buffer content. -ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, - const struct ArrowArray* array, - struct ArrowError* error); - -/// \brief Performs checks on the content of an ArrowArrayView -/// -/// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray, -/// the buffer sizes and some content (fist and last offset) have already -/// been validated at the "default" level. If setting the buffer pointers -/// and sizes otherwise, you may wish to perform checks at a different level. See -/// documentation for ArrowValidationLevel for the details of checks performed -/// at each level. -ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, - enum ArrowValidationLevel validation_level, - struct ArrowError* error); - -/// \brief Reset the contents of an ArrowArrayView and frees resources -void ArrowArrayViewReset(struct ArrowArrayView* array_view); - -/// \brief Check for a null element in an ArrowArrayView -static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, - int64_t i); - -/// \brief Get the type id of a union array element -static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, - int64_t i); - -/// \brief Get the child index of a union array element -static inline int8_t ArrowArrayViewUnionChildIndex( - const struct ArrowArrayView* array_view, int64_t i); - -/// \brief Get the index to use into the relevant union child array -static inline int64_t ArrowArrayViewUnionChildOffset( - const struct ArrowArrayView* array_view, int64_t i); - -/// \brief Get an element in an ArrowArrayView as an integer -/// -/// This function does not check for null values, that values are actually integers, or -/// that values are within a valid range for an int64. -static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, - int64_t i); - -/// \brief Get an element in an ArrowArrayView as an unsigned integer -/// -/// This function does not check for null values, that values are actually integers, or -/// that values are within a valid range for a uint64. -static inline uint64_t ArrowArrayViewGetUIntUnsafe( - const struct ArrowArrayView* array_view, int64_t i); - -/// \brief Get an element in an ArrowArrayView as a double -/// -/// This function does not check for null values, or -/// that values are within a valid range for a double. -static inline double ArrowArrayViewGetDoubleUnsafe( - const struct ArrowArrayView* array_view, int64_t i); - -/// \brief Get an element in an ArrowArrayView as an ArrowStringView -/// -/// This function does not check for null values. -static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( - const struct ArrowArrayView* array_view, int64_t i); - -/// \brief Get an element in an ArrowArrayView as an ArrowBufferView -/// -/// This function does not check for null values. -static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( - const struct ArrowArrayView* array_view, int64_t i); - -/// \brief Get an element in an ArrowArrayView as an ArrowDecimal -/// -/// This function does not check for null values. The out parameter must -/// be initialized with ArrowDecimalInit() with the proper parameters for this -/// type before calling this for the first time. -static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, - int64_t i, struct ArrowDecimal* out); - -/// @} - -/// \defgroup nanoarrow-basic-array-stream Basic ArrowArrayStream implementation -/// -/// An implementation of an ArrowArrayStream based on a collection of -/// zero or more previously-existing ArrowArray objects. Users should -/// initialize and/or validate the contents before transferring the -/// responsibility of the ArrowArrayStream elsewhere. -/// -/// @{ - -/// \brief Initialize an ArrowArrayStream backed by this implementation -/// -/// This function moves the ownership of schema to the array_stream. If -/// this function returns NANOARROW_OK, the caller is responsible for -/// releasing the ArrowArrayStream. -ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, - struct ArrowSchema* schema, int64_t n_arrays); - -/// \brief Set the ith ArrowArray in this ArrowArrayStream. -/// -/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). -/// This function move the ownership of array to the array_stream. i must -/// be greater than zero and less than the value of n_arrays passed in -/// ArrowBasicArrayStreamInit(). Callers are not required to fill all -/// n_arrays members (i.e., n_arrays is a maximum bound). -void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, - struct ArrowArray* array); - -/// \brief Validate the contents of this ArrowArrayStream -/// -/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). -/// This function uses ArrowArrayStreamInitFromSchema() and ArrowArrayStreamSetArray() -/// to validate the contents of the arrays. -ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, - struct ArrowError* error); - -/// @} - -// Undefine ArrowErrorCode, which may have been defined to annotate functions that return -// it to warn for an unused result. -#if defined(ArrowErrorCode) -#undef ArrowErrorCode -#endif - -// Inline function definitions - - - -#ifdef __cplusplus -} -#endif - -#endif -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_BUFFER_INLINE_H_INCLUDED -#define NANOARROW_BUFFER_INLINE_H_INCLUDED - -#include -#include -#include - - - -#ifdef __cplusplus -extern "C" { -#endif - -static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) { - int64_t doubled_capacity = current_capacity * 2; - if (doubled_capacity > new_capacity) { - return doubled_capacity; - } else { - return new_capacity; - } -} - -static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { - buffer->data = NULL; - buffer->size_bytes = 0; - buffer->capacity_bytes = 0; - buffer->allocator = ArrowBufferAllocatorDefault(); -} - -static inline ArrowErrorCode ArrowBufferSetAllocator( - struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) { - if (buffer->data == NULL) { - buffer->allocator = allocator; - return NANOARROW_OK; - } else { - return EINVAL; - } -} - -static inline void ArrowBufferReset(struct ArrowBuffer* buffer) { - if (buffer->data != NULL) { - buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, - buffer->capacity_bytes); - buffer->data = NULL; - } - - buffer->capacity_bytes = 0; - buffer->size_bytes = 0; -} - -static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) { - memcpy(dst, src, sizeof(struct ArrowBuffer)); - src->data = NULL; - ArrowBufferReset(src); -} - -static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, - int64_t new_capacity_bytes, - char shrink_to_fit) { - if (new_capacity_bytes < 0) { - return EINVAL; - } - - if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) { - buffer->data = buffer->allocator.reallocate( - &buffer->allocator, buffer->data, buffer->capacity_bytes, new_capacity_bytes); - if (buffer->data == NULL && new_capacity_bytes > 0) { - buffer->capacity_bytes = 0; - buffer->size_bytes = 0; - return ENOMEM; - } - - buffer->capacity_bytes = new_capacity_bytes; - } - - // Ensures that when shrinking that size <= capacity - if (new_capacity_bytes < buffer->size_bytes) { - buffer->size_bytes = new_capacity_bytes; - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, - int64_t additional_size_bytes) { - int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes; - if (min_capacity_bytes <= buffer->capacity_bytes) { - return NANOARROW_OK; - } - - return ArrowBufferResize( - buffer, _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), 0); -} - -static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, - int64_t size_bytes) { - if (size_bytes > 0) { - memcpy(buffer->data + buffer->size_bytes, data, size_bytes); - buffer->size_bytes += size_bytes; - } -} - -static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, - const void* data, int64_t size_bytes) { - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); - - ArrowBufferAppendUnsafe(buffer, data, size_bytes); - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, - int8_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int8_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, - uint8_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint8_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, - int16_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int16_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, - uint16_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, - int32_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int32_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, - uint32_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, - int64_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, - uint64_t value) { - return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); -} - -static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, - double value) { - return ArrowBufferAppend(buffer, &value, sizeof(double)); -} - -static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, - float value) { - return ArrowBufferAppend(buffer, &value, sizeof(float)); -} - -static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, - struct ArrowStringView value) { - return ArrowBufferAppend(buffer, value.data, value.size_bytes); -} - -static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, - struct ArrowBufferView value) { - return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); -} - -static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, - uint8_t value, int64_t size_bytes) { - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); - - memset(buffer->data + buffer->size_bytes, value, size_bytes); - buffer->size_bytes += size_bytes; - return NANOARROW_OK; -} - -static const uint8_t _ArrowkBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; -static const uint8_t _ArrowkFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; -static const uint8_t _ArrowkPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; -static const uint8_t _ArrowkTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; - -static const uint8_t _ArrowkBytePopcount[] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, - 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, - 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, - 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, - 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, - 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, - 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, - 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; - -static inline int64_t _ArrowRoundUpToMultipleOf8(int64_t value) { - return (value + 7) & ~((int64_t)7); -} - -static inline int64_t _ArrowRoundDownToMultipleOf8(int64_t value) { - return (value / 8) * 8; -} - -static inline int64_t _ArrowBytesForBits(int64_t bits) { - return (bits >> 3) + ((bits & 7) != 0); -} - -static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { - out[0] = (word & 0x1) != 0; - out[1] = (word & 0x2) != 0; - out[2] = (word & 0x4) != 0; - out[3] = (word & 0x8) != 0; - out[4] = (word & 0x10) != 0; - out[5] = (word & 0x20) != 0; - out[6] = (word & 0x40) != 0; - out[7] = (word & 0x80) != 0; -} - -static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { - out[0] = (word & 0x1) != 0; - out[1] = (word & 0x2) != 0; - out[2] = (word & 0x4) != 0; - out[3] = (word & 0x8) != 0; - out[4] = (word & 0x10) != 0; - out[5] = (word & 0x20) != 0; - out[6] = (word & 0x40) != 0; - out[7] = (word & 0x80) != 0; -} - -static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { - *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | - ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | - ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | - ((values[7] + 0x7f) & 0x80)); -} - -static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { - *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | - ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | - ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | - ((values[7] + 0x7f) & 0x80)); -} - -static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { - return (bits[i >> 3] >> (i & 0x07)) & 1; -} - -static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, - int64_t length, int8_t* out) { - if (length == 0) { - return; - } - - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - const int64_t i_last_valid = i_end - 1; - - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_last_valid = i_last_valid / 8; - - if (bytes_begin == bytes_last_valid) { - for (int i = 0; i < length; i++) { - out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); - } - - return; - } - - // first byte - for (int i = 0; i < 8 - (i_begin % 8); i++) { - *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); - } - - // middle bytes - for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { - _ArrowBitsUnpackInt8(bits[i], out); - out += 8; - } - - // last byte - const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); - for (int i = 0; i < bits_remaining; i++) { - *out++ = ArrowBitGet(&bits[bytes_last_valid], i); - } -} - -static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, - int64_t length, int32_t* out) { - if (length == 0) { - return; - } - - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - const int64_t i_last_valid = i_end - 1; - - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_last_valid = i_last_valid / 8; - - if (bytes_begin == bytes_last_valid) { - for (int i = 0; i < length; i++) { - out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); - } - - return; - } - - // first byte - for (int i = 0; i < 8 - (i_begin % 8); i++) { - *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); - } - - // middle bytes - for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { - _ArrowBitsUnpackInt32(bits[i], out); - out += 8; - } - - // last byte - const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); - for (int i = 0; i < bits_remaining; i++) { - *out++ = ArrowBitGet(&bits[bytes_last_valid], i); - } -} - -static inline void ArrowBitSet(uint8_t* bits, int64_t i) { - bits[i / 8] |= _ArrowkBitmask[i % 8]; -} - -static inline void ArrowBitClear(uint8_t* bits, int64_t i) { - bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; -} - -static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { - bits[i / 8] ^= - ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & _ArrowkBitmask[i % 8]; -} - -static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, - uint8_t bits_are_set) { - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - const uint8_t fill_byte = (uint8_t)(-bits_are_set); - - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_end = i_end / 8 + 1; - - const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; - const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; - - if (bytes_end == bytes_begin + 1) { - // set bits within a single byte - const uint8_t only_byte_mask = - i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); - bits[bytes_begin] &= only_byte_mask; - bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); - return; - } - - // set/clear trailing bits of first byte - bits[bytes_begin] &= first_byte_mask; - bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); - - if (bytes_end - bytes_begin > 2) { - // set/clear whole bytes - memset(bits + bytes_begin + 1, fill_byte, (size_t)(bytes_end - bytes_begin - 2)); - } - - if (i_end % 8 == 0) { - return; - } - - // set/clear leading bits of last byte - bits[bytes_end - 1] &= last_byte_mask; - bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); -} - -static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset, - int64_t length) { - if (length == 0) { - return 0; - } - - const int64_t i_begin = start_offset; - const int64_t i_end = start_offset + length; - const int64_t i_last_valid = i_end - 1; - - const int64_t bytes_begin = i_begin / 8; - const int64_t bytes_last_valid = i_last_valid / 8; - - if (bytes_begin == bytes_last_valid) { - // count bits within a single byte - const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; - const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; - - const uint8_t only_byte_mask = - i_end % 8 == 0 ? last_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); - - const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; - return _ArrowkBytePopcount[byte_masked]; - } - - const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; - const uint8_t last_byte_mask = i_end % 8 == 0 ? 0 : _ArrowkTrailingBitmask[i_end % 8]; - int64_t count = 0; - - // first byte - count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; - - // middle bytes - for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { - count += _ArrowkBytePopcount[bits[i]]; - } - - // last byte - count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; - - return count; -} - -static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap) { - ArrowBufferInit(&bitmap->buffer); - bitmap->size_bits = 0; -} - -static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst) { - ArrowBufferMove(&src->buffer, &dst->buffer); - dst->size_bits = src->size_bits; - src->size_bits = 0; -} - -static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, - int64_t additional_size_bits) { - int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; - if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { - return NANOARROW_OK; - } - - NANOARROW_RETURN_NOT_OK( - ArrowBufferReserve(&bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); - - bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, - int64_t new_capacity_bits, - char shrink_to_fit) { - if (new_capacity_bits < 0) { - return EINVAL; - } - - int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); - NANOARROW_RETURN_NOT_OK( - ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); - - if (new_capacity_bits < bitmap->size_bits) { - bitmap->size_bits = new_capacity_bits; - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); - - ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); - return NANOARROW_OK; -} - -static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, - uint8_t bits_are_set, int64_t length) { - ArrowBitsSetTo(bitmap->buffer.data, bitmap->size_bits, length, bits_are_set); - bitmap->size_bits += length; - bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); -} - -static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - const int8_t* values, int64_t n_values) { - if (n_values == 0) { - return; - } - - const int8_t* values_cursor = values; - int64_t n_remaining = n_values; - int64_t out_i_cursor = bitmap->size_bits; - uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; - - // First byte - if ((out_i_cursor % 8) != 0) { - int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; - for (int i = 0; i < n_partial_bits; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); - } - - out_cursor++; - values_cursor += n_partial_bits; - n_remaining -= n_partial_bits; - } - - // Middle bytes - int64_t n_full_bytes = n_remaining / 8; - for (int64_t i = 0; i < n_full_bytes; i++) { - _ArrowBitmapPackInt8(values_cursor, out_cursor); - values_cursor += 8; - out_cursor++; - } - - // Last byte - out_i_cursor += n_full_bytes * 8; - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte - *out_cursor = 0x00; - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); - } - out_cursor++; - } - - bitmap->size_bits += n_values; - bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; -} - -static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, - const int32_t* values, int64_t n_values) { - if (n_values == 0) { - return; - } - - const int32_t* values_cursor = values; - int64_t n_remaining = n_values; - int64_t out_i_cursor = bitmap->size_bits; - uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; - - // First byte - if ((out_i_cursor % 8) != 0) { - int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; - for (int i = 0; i < n_partial_bits; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values[i]); - } - - out_cursor++; - values_cursor += n_partial_bits; - n_remaining -= n_partial_bits; - } - - // Middle bytes - int64_t n_full_bytes = n_remaining / 8; - for (int64_t i = 0; i < n_full_bytes; i++) { - _ArrowBitmapPackInt32(values_cursor, out_cursor); - values_cursor += 8; - out_cursor++; - } - - // Last byte - out_i_cursor += n_full_bytes * 8; - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte - *out_cursor = 0x00; - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values_cursor[i]); - } - out_cursor++; - } - - bitmap->size_bits += n_values; - bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; -} - -static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { - ArrowBufferReset(&bitmap->buffer); - bitmap->size_bits = 0; -} - -#ifdef __cplusplus -} -#endif - -#endif -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef NANOARROW_ARRAY_INLINE_H_INCLUDED -#define NANOARROW_ARRAY_INLINE_H_INCLUDED - -#include -#include -#include -#include -#include - - - - -#ifdef __cplusplus -extern "C" { -#endif - -static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - return &private_data->bitmap; -} - -static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - switch (i) { - case 0: - return &private_data->bitmap.buffer; - default: - return private_data->buffers + i - 1; - } -} - -// We don't currently support the case of unions where type_id != child_index; -// however, these functions are used to keep track of where that assumption -// is made. -static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array, - int8_t type_id) { - NANOARROW_UNUSED(array); - return type_id; -} - -static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array, - int8_t child_index) { - NANOARROW_UNUSED(array); - return child_index; -} - -static inline int32_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { - if (*type_ids == '\0') { - return 0; - } - - int32_t i = 0; - long type_id; - char* end_ptr; - do { - type_id = strtol(type_ids, &end_ptr, 10); - if (end_ptr == type_ids || type_id < 0 || type_id > 127) { - return -1; - } - - if (out != NULL) { - out[i] = (int8_t)type_id; - } - - i++; - - type_ids = end_ptr; - if (*type_ids == '\0') { - return i; - } else if (*type_ids != ',') { - return -1; - } else { - type_ids++; - } - } while (1); - - return -1; -} - -static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const int8_t* type_ids, - int64_t n_type_ids, - int64_t n_children) { - if (n_type_ids != n_children) { - return 0; - } - - for (int8_t i = 0; i < n_type_ids; i++) { - if (type_ids[i] != i) { - return 0; - } - } - - return 1; -} - -static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str, - int64_t n_children) { - int8_t type_ids[128]; - int32_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); - return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, n_children); -} - -static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_UNINITIALIZED: - return EINVAL; - case NANOARROW_TYPE_SPARSE_UNION: - case NANOARROW_TYPE_DENSE_UNION: - // Note that this value could be -1 if the type_ids string was invalid - if (private_data->union_type_id_is_child_index != 1) { - return EINVAL; - } else { - break; - } - default: - break; - } - if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { - return EINVAL; - } - - // Initialize any data offset buffer with a single zero - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && - private_data->layout.element_size_bits[i] == 64) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); - } else if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && - private_data->layout.element_size_bits[i] == 32) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); - } - } - - // Start building any child arrays or dictionaries - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); - } - - if (array->dictionary != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { - for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); - NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); - } - - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); - } - - if (array->dictionary != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); - } - - return NANOARROW_OK; -} - -static inline ArrowErrorCode _ArrowArrayAppendBits(struct ArrowArray* array, - int64_t buffer_i, uint8_t value, - int64_t n) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); - int64_t bytes_required = - _ArrowRoundUpToMultipleOf8(private_data->layout.element_size_bits[buffer_i] * - (array->length + 1)) / - 8; - if (bytes_required > buffer->size_bytes) { - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendFill(buffer, 0, bytes_required - buffer->size_bytes)); - } - - ArrowBitsSetTo(buffer->data, array->length, n, value); - return NANOARROW_OK; -} - -static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* array, - int64_t n, uint8_t is_valid) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - if (n == 0) { - return NANOARROW_OK; - } - - // Some type-specific handling - switch (private_data->storage_type) { - case NANOARROW_TYPE_NA: - // (An empty value for a null array *is* a null) - array->null_count += n; - array->length += n; - return NANOARROW_OK; - - case NANOARROW_TYPE_DENSE_UNION: { - // Add one null to the first child and append n references to that child - int8_t type_id = _ArrowArrayUnionTypeId(array, 0); - NANOARROW_RETURN_NOT_OK( - _ArrowArrayAppendEmptyInternal(array->children[0], 1, is_valid)); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); - for (int64_t i = 0; i < n; i++) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( - ArrowArrayBuffer(array, 1), (int32_t)array->children[0]->length - 1)); - } - // For the purposes of array->null_count, union elements are never considered "null" - // even if some children contain nulls. - array->length += n; - return NANOARROW_OK; - } - - case NANOARROW_TYPE_SPARSE_UNION: { - // Add n nulls to the first child and append n references to that child - int8_t type_id = _ArrowArrayUnionTypeId(array, 0); - NANOARROW_RETURN_NOT_OK( - _ArrowArrayAppendEmptyInternal(array->children[0], n, is_valid)); - for (int64_t i = 1; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); - } - - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); - // For the purposes of array->null_count, union elements are never considered "null" - // even if some children contain nulls. - array->length += n; - return NANOARROW_OK; - } - - case NANOARROW_TYPE_FIXED_SIZE_LIST: - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( - array->children[0], n * private_data->layout.child_size_elements)); - break; - case NANOARROW_TYPE_STRUCT: - for (int64_t i = 0; i < array->n_children; i++) { - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); - } - break; - - default: - break; - } - - // Append n is_valid bits to the validity bitmap. If we haven't allocated a bitmap yet - // and we need to append nulls, do it now. - if (!is_valid && private_data->bitmap.buffer.data == NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, array->length + n)); - ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); - ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); - } else if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); - ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); - } - - // Add appropriate buffer fill - struct ArrowBuffer* buffer; - int64_t size_bytes; - - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - buffer = ArrowArrayBuffer(array, i); - size_bytes = private_data->layout.element_size_bits[i] / 8; - - switch (private_data->layout.buffer_type[i]) { - case NANOARROW_BUFFER_TYPE_NONE: - case NANOARROW_BUFFER_TYPE_VALIDITY: - continue; - case NANOARROW_BUFFER_TYPE_DATA_OFFSET: - // Append the current value at the end of the offset buffer for each element - NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes * n)); - - for (int64_t j = 0; j < n; j++) { - ArrowBufferAppendUnsafe(buffer, buffer->data + size_bytes * (array->length + j), - size_bytes); - } - - // Skip the data buffer - i++; - continue; - case NANOARROW_BUFFER_TYPE_DATA: - // Zero out the next bit of memory - if (private_data->layout.element_size_bits[i] % 8 == 0) { - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill(buffer, 0, size_bytes * n)); - } else { - NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, i, 0, n)); - } - continue; - - case NANOARROW_BUFFER_TYPE_TYPE_ID: - case NANOARROW_BUFFER_TYPE_UNION_OFFSET: - // These cases return above - return EINVAL; - } - } - - array->length += n; - array->null_count += n * !is_valid; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) { - return _ArrowArrayAppendEmptyInternal(array, n, 0); -} - -static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n) { - return _ArrowArrayAppendEmptyInternal(array, n, 1); -} - -static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, - int64_t value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_INT64: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); - break; - case NANOARROW_TYPE_INT32: - _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, (int32_t)value)); - break; - case NANOARROW_TYPE_INT16: - _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt16(data_buffer, (int16_t)value)); - break; - case NANOARROW_TYPE_INT8: - _NANOARROW_CHECK_RANGE(value, INT8_MIN, INT8_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(data_buffer, (int8_t)value)); - break; - case NANOARROW_TYPE_UINT64: - case NANOARROW_TYPE_UINT32: - case NANOARROW_TYPE_UINT16: - case NANOARROW_TYPE_UINT8: - _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); - return ArrowArrayAppendUInt(array, value); - case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); - break; - case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); - break; - case NANOARROW_TYPE_BOOL: - NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, - uint64_t value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_UINT64: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); - break; - case NANOARROW_TYPE_UINT32: - _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); - break; - case NANOARROW_TYPE_UINT16: - _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); - break; - case NANOARROW_TYPE_UINT8: - _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); - break; - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_INT8: - _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); - return ArrowArrayAppendInt(array, value); - case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); - break; - case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); - break; - case NANOARROW_TYPE_BOOL: - NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, - double value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_DOUBLE: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(double))); - break; - case NANOARROW_TYPE_FLOAT: - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, - struct ArrowBufferView value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); - struct ArrowBuffer* data_buffer = ArrowArrayBuffer( - array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); - int32_t offset; - int64_t large_offset; - int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - offset = ((int32_t*)offset_buffer->data)[array->length]; - if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { - return EOVERFLOW; - } - - offset += (int32_t)value.size_bytes; - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - large_offset = ((int64_t*)offset_buffer->data)[array->length]; - large_offset += value.size_bytes; - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - if (value.size_bytes != fixed_size_bytes) { - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, - struct ArrowStringView value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBufferView buffer_view; - buffer_view.data.data = value.data; - buffer_view.size_bytes = value.size_bytes; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_BINARY: - case NANOARROW_TYPE_LARGE_BINARY: - return ArrowArrayAppendBytes(array, buffer_view); - default: - return EINVAL; - } -} - -static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, - const struct ArrowInterval* value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_INTERVAL_MONTHS: { - if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); - break; - } - case NANOARROW_TYPE_INTERVAL_DAY_TIME: { - if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->ms)); - break; - } - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { - if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(data_buffer, value->ns)); - break; - } - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, - const struct ArrowDecimal* value) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); - - switch (private_data->storage_type) { - case NANOARROW_TYPE_DECIMAL128: - if (value->n_words != 2) { - return EINVAL; - } else { - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value->words, 2 * sizeof(uint64_t))); - break; - } - case NANOARROW_TYPE_DECIMAL256: - if (value->n_words != 4) { - return EINVAL; - } else { - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppend(data_buffer, value->words, 4 * sizeof(uint64_t))); - break; - } - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - int64_t child_length; - - switch (private_data->storage_type) { - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_MAP: - child_length = array->children[0]->length; - if (child_length > INT32_MAX) { - return EOVERFLOW; - } - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), (int32_t)child_length)); - break; - case NANOARROW_TYPE_LARGE_LIST: - child_length = array->children[0]->length; - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendInt64(ArrowArrayBuffer(array, 1), child_length)); - break; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - child_length = array->children[0]->length; - if (child_length != - ((array->length + 1) * private_data->layout.child_size_elements)) { - return EINVAL; - } - break; - case NANOARROW_TYPE_STRUCT: - for (int64_t i = 0; i < array->n_children; i++) { - child_length = array->children[i]->length; - if (child_length != (array->length + 1)) { - return EINVAL; - } - } - break; - default: - return EINVAL; - } - - if (private_data->bitmap.buffer.data != NULL) { - NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); - } - - array->length++; - return NANOARROW_OK; -} - -static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, - int8_t type_id) { - struct ArrowArrayPrivateData* private_data = - (struct ArrowArrayPrivateData*)array->private_data; - - int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); - if (child_index < 0 || child_index >= array->n_children) { - return EINVAL; - } - - switch (private_data->storage_type) { - case NANOARROW_TYPE_DENSE_UNION: - // Append the target child length to the union offsets buffer - _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); - NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( - ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); - break; - case NANOARROW_TYPE_SPARSE_UNION: - // Append one empty to any non-target column that isn't already the right length - // or abort if appending a null will result in a column with invalid length - for (int64_t i = 0; i < array->n_children; i++) { - if (i == child_index || array->children[i]->length == (array->length + 1)) { - continue; - } - - if (array->children[i]->length != array->length) { - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1)); - } - - break; - default: - return EINVAL; - } - - // Write to the type_ids buffer - NANOARROW_RETURN_NOT_OK( - ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); - array->length++; - return NANOARROW_OK; -} - -static inline void ArrowArrayViewMove(struct ArrowArrayView* src, - struct ArrowArrayView* dst) { - memcpy(dst, src, sizeof(struct ArrowArrayView)); - ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); -} - -static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, - int64_t i) { - const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; - i += array_view->offset; - switch (array_view->storage_type) { - case NANOARROW_TYPE_NA: - return 0x01; - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_SPARSE_UNION: - // Unions are "never null" in Arrow land - return 0x00; - default: - return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); - } -} - -static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, - int64_t i) { - switch (array_view->storage_type) { - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_SPARSE_UNION: - return array_view->buffer_views[0].data.as_int8[i]; - default: - return -1; - } -} - -static inline int8_t ArrowArrayViewUnionChildIndex( - const struct ArrowArrayView* array_view, int64_t i) { - int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); - if (array_view->union_type_id_map == NULL) { - return type_id; - } else { - return array_view->union_type_id_map[type_id]; - } -} - -static inline int64_t ArrowArrayViewUnionChildOffset( - const struct ArrowArrayView* array_view, int64_t i) { - switch (array_view->storage_type) { - case NANOARROW_TYPE_DENSE_UNION: - return array_view->buffer_views[1].data.as_int32[i]; - case NANOARROW_TYPE_SPARSE_UNION: - return i; - default: - return -1; - } -} - -static inline int64_t ArrowArrayViewListChildOffset( - const struct ArrowArrayView* array_view, int64_t i) { - switch (array_view->storage_type) { - case NANOARROW_TYPE_LIST: - return array_view->buffer_views[1].data.as_int32[i]; - case NANOARROW_TYPE_LARGE_LIST: - return array_view->buffer_views[1].data.as_int64[i]; - default: - return -1; - } -} - -static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, - int64_t i) { - const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - i += array_view->offset; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INT64: - return data_view->data.as_int64[i]; - case NANOARROW_TYPE_UINT64: - return data_view->data.as_uint64[i]; - case NANOARROW_TYPE_INTERVAL_MONTHS: - case NANOARROW_TYPE_INT32: - return data_view->data.as_int32[i]; - case NANOARROW_TYPE_UINT32: - return data_view->data.as_uint32[i]; - case NANOARROW_TYPE_INT16: - return data_view->data.as_int16[i]; - case NANOARROW_TYPE_UINT16: - return data_view->data.as_uint16[i]; - case NANOARROW_TYPE_INT8: - return data_view->data.as_int8[i]; - case NANOARROW_TYPE_UINT8: - return data_view->data.as_uint8[i]; - case NANOARROW_TYPE_DOUBLE: - return (int64_t)data_view->data.as_double[i]; - case NANOARROW_TYPE_FLOAT: - return (int64_t)data_view->data.as_float[i]; - case NANOARROW_TYPE_BOOL: - return ArrowBitGet(data_view->data.as_uint8, i); - default: - return INT64_MAX; - } -} - -static inline uint64_t ArrowArrayViewGetUIntUnsafe( - const struct ArrowArrayView* array_view, int64_t i) { - i += array_view->offset; - const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INT64: - return data_view->data.as_int64[i]; - case NANOARROW_TYPE_UINT64: - return data_view->data.as_uint64[i]; - case NANOARROW_TYPE_INTERVAL_MONTHS: - case NANOARROW_TYPE_INT32: - return data_view->data.as_int32[i]; - case NANOARROW_TYPE_UINT32: - return data_view->data.as_uint32[i]; - case NANOARROW_TYPE_INT16: - return data_view->data.as_int16[i]; - case NANOARROW_TYPE_UINT16: - return data_view->data.as_uint16[i]; - case NANOARROW_TYPE_INT8: - return data_view->data.as_int8[i]; - case NANOARROW_TYPE_UINT8: - return data_view->data.as_uint8[i]; - case NANOARROW_TYPE_DOUBLE: - return (uint64_t)data_view->data.as_double[i]; - case NANOARROW_TYPE_FLOAT: - return (uint64_t)data_view->data.as_float[i]; - case NANOARROW_TYPE_BOOL: - return ArrowBitGet(data_view->data.as_uint8, i); - default: - return UINT64_MAX; - } -} - -static inline double ArrowArrayViewGetDoubleUnsafe( - const struct ArrowArrayView* array_view, int64_t i) { - i += array_view->offset; - const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INT64: - return (double)data_view->data.as_int64[i]; - case NANOARROW_TYPE_UINT64: - return (double)data_view->data.as_uint64[i]; - case NANOARROW_TYPE_INT32: - return data_view->data.as_int32[i]; - case NANOARROW_TYPE_UINT32: - return data_view->data.as_uint32[i]; - case NANOARROW_TYPE_INT16: - return data_view->data.as_int16[i]; - case NANOARROW_TYPE_UINT16: - return data_view->data.as_uint16[i]; - case NANOARROW_TYPE_INT8: - return data_view->data.as_int8[i]; - case NANOARROW_TYPE_UINT8: - return data_view->data.as_uint8[i]; - case NANOARROW_TYPE_DOUBLE: - return data_view->data.as_double[i]; - case NANOARROW_TYPE_FLOAT: - return data_view->data.as_float[i]; - case NANOARROW_TYPE_BOOL: - return ArrowBitGet(data_view->data.as_uint8, i); - default: - return DBL_MAX; - } -} - -static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( - const struct ArrowArrayView* array_view, int64_t i) { - i += array_view->offset; - const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; - const char* data_view = array_view->buffer_views[2].data.as_char; - - struct ArrowStringView view; - switch (array_view->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - view.data = data_view + offsets_view->data.as_int32[i]; - view.size_bytes = - offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; - break; - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - view.data = data_view + offsets_view->data.as_int64[i]; - view.size_bytes = - offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; - break; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - view.size_bytes = array_view->layout.element_size_bits[1] / 8; - view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes); - break; - default: - view.data = NULL; - view.size_bytes = 0; - break; - } - - return view; -} - -static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( - const struct ArrowArrayView* array_view, int64_t i) { - i += array_view->offset; - const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; - const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; - - struct ArrowBufferView view; - switch (array_view->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_BINARY: - view.size_bytes = - offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; - view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; - break; - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_LARGE_BINARY: - view.size_bytes = - offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; - view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; - break; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - view.size_bytes = array_view->layout.element_size_bits[1] / 8; - view.data.as_uint8 = - array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes); - break; - default: - view.data.data = NULL; - view.size_bytes = 0; - break; - } - - return view; -} - -static inline void ArrowArrayViewGetIntervalUnsafe( - const struct ArrowArrayView* array_view, int64_t i, struct ArrowInterval* out) { - const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; - switch (array_view->storage_type) { - case NANOARROW_TYPE_INTERVAL_MONTHS: { - const size_t size = sizeof(int32_t); - memcpy(&out->months, data_view + i * size, sizeof(int32_t)); - break; - } - case NANOARROW_TYPE_INTERVAL_DAY_TIME: { - const size_t size = sizeof(int32_t) + sizeof(int32_t); - memcpy(&out->days, data_view + i * size, sizeof(int32_t)); - memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); - break; - } - case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { - const size_t size = sizeof(int32_t) + sizeof(int32_t) + sizeof(int64_t); - memcpy(&out->months, data_view + i * size, sizeof(int32_t)); - memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); - memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); - break; - } - default: - break; - } -} - -static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, - int64_t i, struct ArrowDecimal* out) { - i += array_view->offset; - const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; - switch (array_view->storage_type) { - case NANOARROW_TYPE_DECIMAL128: - ArrowDecimalSetBytes(out, data_view + (i * 16)); - break; - case NANOARROW_TYPE_DECIMAL256: - ArrowDecimalSetBytes(out, data_view + (i * 32)); - break; - default: - memset(out->words, 0, sizeof(out->words)); - break; - } -} - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/apis/r/src/nanoarrow.hpp b/apis/r/src/nanoarrow.hpp deleted file mode 100644 index 8d5b841e28..0000000000 --- a/apis/r/src/nanoarrow.hpp +++ /dev/null @@ -1,501 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include - -#include "nanoarrow.h" - -#ifndef NANOARROW_HPP_INCLUDED -#define NANOARROW_HPP_INCLUDED - -/// \defgroup nanoarrow_hpp Nanoarrow C++ Helpers -/// -/// The utilities provided in this file are intended to support C++ users -/// of the nanoarrow C library such that C++-style resource allocation -/// and error handling can be used with nanoarrow data structures. -/// These utilities are not intended to mirror the nanoarrow C API. - -namespace nanoarrow { - -/// \defgroup nanoarrow_hpp-errors Error handling helpers -/// -/// Most functions in the C API return an ArrowErrorCode to communicate -/// possible failure. Except where documented, it is usually not safe to -/// continue after a non-zero value has been returned. While the -/// nanoarrow C++ helpers do not throw any exceptions of their own, -/// these helpers are provided to facilitate using the nanoarrow C++ helpers -/// in frameworks where this is a useful error handling idiom. -/// -/// @{ - -class Exception : public std::exception { - public: - Exception(const std::string& msg) : msg_(msg) {} - const char* what() const noexcept { return msg_.c_str(); } - - private: - std::string msg_; -}; - -#if defined(NANOARROW_DEBUG) -#define _NANOARROW_THROW_NOT_OK_IMPL(NAME, EXPR, EXPR_STR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) { \ - throw nanoarrow::Exception( \ - std::string(EXPR_STR) + std::string(" failed with errno ") + \ - std::to_string(NAME) + std::string("\n * ") + std::string(__FILE__) + \ - std::string(":") + std::to_string(__LINE__) + std::string("\n")); \ - } \ - } while (0) -#else -#define _NANOARROW_THROW_NOT_OK_IMPL(NAME, EXPR, EXPR_STR) \ - do { \ - const int NAME = (EXPR); \ - if (NAME) { \ - throw nanoarrow::Exception(std::string(EXPR_STR) + \ - std::string(" failed with errno ") + \ - std::to_string(NAME)); \ - } \ - } while (0) -#endif - -#define NANOARROW_THROW_NOT_OK(EXPR) \ - _NANOARROW_THROW_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, \ - #EXPR) - -/// @} - -namespace internal { - -/// \defgroup nanoarrow_hpp-unique_base Base classes for Unique wrappers -/// -/// @{ - -template -static inline void init_pointer(T* data); - -template -static inline void move_pointer(T* src, T* dst); - -template -static inline void release_pointer(T* data); - -template <> -inline void init_pointer(struct ArrowSchema* data) { - data->release = nullptr; -} - -template <> -inline void move_pointer(struct ArrowSchema* src, struct ArrowSchema* dst) { - ArrowSchemaMove(src, dst); -} - -template <> -inline void release_pointer(struct ArrowSchema* data) { - if (data->release != nullptr) { - data->release(data); - } -} - -template <> -inline void init_pointer(struct ArrowArray* data) { - data->release = nullptr; -} - -template <> -inline void move_pointer(struct ArrowArray* src, struct ArrowArray* dst) { - ArrowArrayMove(src, dst); -} - -template <> -inline void release_pointer(struct ArrowArray* data) { - if (data->release != nullptr) { - data->release(data); - } -} - -template <> -inline void init_pointer(struct ArrowArrayStream* data) { - data->release = nullptr; -} - -template <> -inline void move_pointer(struct ArrowArrayStream* src, struct ArrowArrayStream* dst) { - ArrowArrayStreamMove(src, dst); -} - -template <> -inline void release_pointer(ArrowArrayStream* data) { - if (data->release != nullptr) { - data->release(data); - } -} - -template <> -inline void init_pointer(struct ArrowBuffer* data) { - ArrowBufferInit(data); -} - -template <> -inline void move_pointer(struct ArrowBuffer* src, struct ArrowBuffer* dst) { - ArrowBufferMove(src, dst); -} - -template <> -inline void release_pointer(struct ArrowBuffer* data) { - ArrowBufferReset(data); -} - -template <> -inline void init_pointer(struct ArrowBitmap* data) { - ArrowBitmapInit(data); -} - -template <> -inline void move_pointer(struct ArrowBitmap* src, struct ArrowBitmap* dst) { - ArrowBitmapMove(src, dst); -} - -template <> -inline void release_pointer(struct ArrowBitmap* data) { - ArrowBitmapReset(data); -} - -template <> -inline void init_pointer(struct ArrowArrayView* data) { - ArrowArrayViewInitFromType(data, NANOARROW_TYPE_UNINITIALIZED); -} - -template <> -inline void move_pointer(struct ArrowArrayView* src, struct ArrowArrayView* dst) { - ArrowArrayViewMove(src, dst); -} - -template <> -inline void release_pointer(struct ArrowArrayView* data) { - ArrowArrayViewReset(data); -} - -/// \brief A unique_ptr-like base class for stack-allocatable objects -/// \tparam T The object type -template -class Unique { - public: - /// \brief Construct an invalid instance of T holding no resources - Unique() { init_pointer(&data_); } - - /// \brief Move and take ownership of data - Unique(T* data) { move_pointer(data, &data_); } - - /// \brief Move and take ownership of data wrapped by rhs - Unique(Unique&& rhs) : Unique(rhs.get()) {} - Unique& operator=(Unique&& rhs) { - reset(rhs.get()); - return *this; - } - - // These objects are not copyable - Unique(const Unique& rhs) = delete; - - /// \brief Get a pointer to the data owned by this object - T* get() noexcept { return &data_; } - const T* get() const noexcept { return &data_; } - - /// \brief Use the pointer operator to access fields of this object - T* operator->() noexcept { return &data_; } - const T* operator->() const noexcept { return &data_; } - - /// \brief Call data's release callback if valid - void reset() { release_pointer(&data_); } - - /// \brief Call data's release callback if valid and move ownership of the data - /// pointed to by data - void reset(T* data) { - reset(); - move_pointer(data, &data_); - } - - /// \brief Move ownership of this object to the data pointed to by out - void move(T* out) { move_pointer(&data_, out); } - - ~Unique() { reset(); } - - protected: - T data_; -}; - -/// @} - -} // namespace internal - -/// \defgroup nanoarrow_hpp-unique Unique object wrappers -/// -/// The Arrow C Data interface, the Arrow C Stream interface, and the -/// nanoarrow C library use stack-allocatable objects, some of which -/// require initialization or cleanup. -/// -/// @{ - -/// \brief Class wrapping a unique struct ArrowSchema -using UniqueSchema = internal::Unique; - -/// \brief Class wrapping a unique struct ArrowArray -using UniqueArray = internal::Unique; - -/// \brief Class wrapping a unique struct ArrowArrayStream -using UniqueArrayStream = internal::Unique; - -/// \brief Class wrapping a unique struct ArrowBuffer -using UniqueBuffer = internal::Unique; - -/// \brief Class wrapping a unique struct ArrowBitmap -using UniqueBitmap = internal::Unique; - -/// \brief Class wrapping a unique struct ArrowArrayView -using UniqueArrayView = internal::Unique; - -/// @} - -/// \defgroup nanoarrow_hpp-array-stream ArrayStream helpers -/// -/// These classes provide simple ArrowArrayStream implementations that -/// can be extended to help simplify the process of creating a valid -/// ArrowArrayStream implementation or used as-is for testing. -/// -/// @{ - -/// @brief Export an ArrowArrayStream from a standard C++ class -/// @tparam T A class with methods `int GetSchema(ArrowSchema*)`, `int -/// GetNext(ArrowArray*)`, and `const char* GetLastError()` -/// -/// This class allows a standard C++ class to be exported to a generic ArrowArrayStream -/// consumer by mapping C callback invocations to method calls on an instance of the -/// object whose lifecycle is owned by the ArrowArrayStream. See VectorArrayStream for -/// minimal useful example of this pattern. -/// -/// The methods must be accessible to the ArrayStreamFactory, either as public methods or -/// by declaring ArrayStreamFactory a friend. Implementors are encouraged (but -/// not required) to implement a ToArrayStream(ArrowArrayStream*) that creates a new -/// instance owned by the ArrowArrayStream and moves the relevant data to that instance. -/// -/// An example implementation might be: -/// -/// \code -/// class StreamImpl { -/// public: -/// // Public methods (e.g., constructor) used from C++ to initialize relevant data -/// -/// // Idiomatic exporter to move data + lifecycle responsibility to an instance -/// // managed by the ArrowArrayStream callbacks -/// void ToArrayStream(struct ArrowArrayStream* out) { -/// ArrayStreamFactory::InitArrayStream(new StreamImpl(...), out); -/// } -/// -/// private: -/// // Make relevant methods available to the ArrayStreamFactory -/// friend class ArrayStreamFactory; -/// -/// // Method implementations (called from C, not normally interacted with from C++) -/// int GetSchema(struct ArrowSchema* schema) { return ENOTSUP; } -/// int GetNext(struct ArrowArray* array) { return ENOTSUP; } -/// const char* GetLastError() { nullptr; } -/// }; -/// \endcode -/// -/// An example usage might be: -/// -/// \code -/// // Call constructor and/or public methods to initialize relevant data -/// StreamImpl impl; -/// -/// // Export to ArrowArrayStream after data are finalized -/// UniqueArrayStream stream; -/// impl.ToArrayStream(stream.get()); -/// \endcode -template -class ArrayStreamFactory { - public: - /// \brief Take ownership of instance and populate callbacks of out - static void InitArrayStream(T* instance, struct ArrowArrayStream* out) { - out->get_schema = &get_schema_wrapper; - out->get_next = &get_next_wrapper; - out->get_last_error = &get_last_error_wrapper; - out->release = &release_wrapper; - out->private_data = instance; - } - - private: - static int get_schema_wrapper(struct ArrowArrayStream* stream, - struct ArrowSchema* schema) { - return reinterpret_cast(stream->private_data)->GetSchema(schema); - } - - static int get_next_wrapper(struct ArrowArrayStream* stream, struct ArrowArray* array) { - return reinterpret_cast(stream->private_data)->GetNext(array); - } - - static const char* get_last_error_wrapper(struct ArrowArrayStream* stream) { - return reinterpret_cast(stream->private_data)->GetLastError(); - } - - static void release_wrapper(struct ArrowArrayStream* stream) { - delete reinterpret_cast(stream->private_data); - stream->release = nullptr; - stream->private_data = nullptr; - } -}; - -/// \brief An empty array stream -/// -/// This class can be constructed from an struct ArrowSchema and implements a default -/// get_next() method that always marks the output ArrowArray as released. -/// -/// DEPRECATED (0.4.0): Early versions of nanoarrow allowed subclasses to override -/// get_schema(), get_next(), and get_last_error(). This functionality will be removed -/// in a future release: use the pattern documented in ArrayStreamFactory to create -/// custom ArrowArrayStream implementations. -class EmptyArrayStream { - public: - /// \brief Create an EmptyArrayStream from an ArrowSchema - /// - /// Takes ownership of schema. - EmptyArrayStream(struct ArrowSchema* schema) : schema_(schema) { - ArrowErrorInit(&error_); - } - - /// \brief Export to ArrowArrayStream - void ToArrayStream(struct ArrowArrayStream* out) { - EmptyArrayStream* impl = new EmptyArrayStream(schema_.get()); - ArrayStreamFactory::InitArrayStream(impl, out); - } - - /// \brief Create an empty UniqueArrayStream from a struct ArrowSchema - /// - /// DEPRECATED (0.4.0): Use the constructor + ToArrayStream() to export an - /// EmptyArrayStream to an ArrowArrayStream consumer. - static UniqueArrayStream MakeUnique(struct ArrowSchema* schema) { - UniqueArrayStream stream; - EmptyArrayStream(schema).ToArrayStream(stream.get()); - return stream; - } - - virtual ~EmptyArrayStream() {} - - protected: - UniqueSchema schema_; - struct ArrowError error_; - - void MakeStream(struct ArrowArrayStream* stream) { ToArrayStream(stream); } - - virtual int get_schema(struct ArrowSchema* schema) { - return ArrowSchemaDeepCopy(schema_.get(), schema); - } - - virtual int get_next(struct ArrowArray* array) { - array->release = nullptr; - return NANOARROW_OK; - } - - virtual const char* get_last_error() { return error_.message; } - - private: - friend class ArrayStreamFactory; - - int GetSchema(struct ArrowSchema* schema) { return get_schema(schema); } - - int GetNext(struct ArrowArray* array) { return get_next(array); } - - const char* GetLastError() { return get_last_error(); } -}; - -/// \brief Implementation of an ArrowArrayStream backed by a vector of UniqueArray objects -class VectorArrayStream { - public: - /// \brief Create a VectorArrayStream from an ArrowSchema + vector of UniqueArray - /// - /// Takes ownership of schema and moves arrays if possible. - VectorArrayStream(struct ArrowSchema* schema, std::vector arrays) - : offset_(0), schema_(schema), arrays_(std::move(arrays)) {} - - /// \brief Create a one-shot VectorArrayStream from an ArrowSchema + ArrowArray - /// - /// Takes ownership of schema and array. - VectorArrayStream(struct ArrowSchema* schema, struct ArrowArray* array) - : offset_(0), schema_(schema) { - arrays_.emplace_back(array); - } - - /// \brief Export to ArrowArrayStream - void ToArrayStream(struct ArrowArrayStream* out) { - VectorArrayStream* impl = new VectorArrayStream(schema_.get(), std::move(arrays_)); - ArrayStreamFactory::InitArrayStream(impl, out); - } - - /// \brief Create a UniqueArrowArrayStream from an existing array - /// - /// DEPRECATED (0.4.0): Use the constructors + ToArrayStream() to export a - /// VectorArrayStream to an ArrowArrayStream consumer. - static UniqueArrayStream MakeUnique(struct ArrowSchema* schema, - struct ArrowArray* array) { - UniqueArrayStream stream; - VectorArrayStream(schema, array).ToArrayStream(stream.get()); - return stream; - } - - /// \brief Create a UniqueArrowArrayStream from existing arrays - /// - /// DEPRECATED (0.4.0): Use the constructor + ToArrayStream() to export a - /// VectorArrayStream to an ArrowArrayStream consumer. - static UniqueArrayStream MakeUnique(struct ArrowSchema* schema, - std::vector arrays) { - UniqueArrayStream stream; - VectorArrayStream(schema, std::move(arrays)).ToArrayStream(stream.get()); - return stream; - } - - private: - int64_t offset_; - UniqueSchema schema_; - std::vector arrays_; - - friend class ArrayStreamFactory; - - int GetSchema(struct ArrowSchema* schema) { - return ArrowSchemaDeepCopy(schema_.get(), schema); - } - - int GetNext(struct ArrowArray* array) { - if (offset_ < static_cast(arrays_.size())) { - arrays_[offset_++].move(array); - } else { - array->release = nullptr; - } - - return NANOARROW_OK; - } - - const char* GetLastError() { return ""; } -}; - -/// @} - -} // namespace nanoarrow - -#endif diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index e8a8f949a9..4a82c16b86 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -1,7 +1,7 @@ -#include // for R interface to C++ -#include // for C interface to Arrow -#include // for C/C++ interface to Arrow -#include // for fromInteger64 +#include // for R interface to C++ +#include // for C interface to Arrow (via R package) +#include // for C/C++ interface to Arrow +#include // for fromInteger64 // we currently get deprecation warnings by default which are noisy #ifndef TILEDB_NO_API_DEPRECATION_WARNINGS diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index b9b65621bb..5dcc38ad51 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -5,9 +5,9 @@ //#define RCPP_DEBUG_LEVEL 5 -#include // for R interface to C++ -#include // for C interface to Arrow -#include +#include // for R interface to C++ +#include // for C interface to Arrow (via R package nanoarrow) +#include #include #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 4 diff --git a/apis/r/src/rutilities.cpp b/apis/r/src/rutilities.cpp index 61fe84f1e3..e986eef836 100644 --- a/apis/r/src/rutilities.cpp +++ b/apis/r/src/rutilities.cpp @@ -4,9 +4,9 @@ #define TILEDB_NO_API_DEPRECATION_WARNINGS #endif -#include // for R interface to C++ -#include // for C interface to Arrow -#include // for fromInteger64 +#include // for R interface to C++ +#include // for C interface to Arrow +#include // for fromInteger64 // We get these via nanoarrow and must cannot include carrow.h again #define ARROW_SCHEMA_AND_ARRAY_DEFINED 1 From 8a6929dbe0280dad01556a3dc8a44ee6c80000e4 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 16:19:28 -0500 Subject: [PATCH 36/39] Re-activate -Werror --- libtiledbsoma/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libtiledbsoma/CMakeLists.txt b/libtiledbsoma/CMakeLists.txt index a1a0de14a6..a055d71cd2 100644 --- a/libtiledbsoma/CMakeLists.txt +++ b/libtiledbsoma/CMakeLists.txt @@ -187,6 +187,10 @@ if(MSVC) else() add_compile_options(-Wall -Wextra) + if(TILEDBSOMA_ENABLE_WERROR) + add_compile_options(-Werror) + endif() + # Build-specific flags if(CMAKE_BUILD_TYPE MATCHES "Debug") add_compile_options(-DDEBUG -O0 -g3 -ggdb3 -gdwarf-3) From e60a6c3fb05182387a41c6f1a1f861fa100b7a8f Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 16:20:38 -0500 Subject: [PATCH 37/39] Chore --- .github/workflows/r-ci.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/r-ci.yml b/.github/workflows/r-ci.yml index 88c26eb811..7d4b9b4878 100644 --- a/.github/workflows/r-ci.yml +++ b/.github/workflows/r-ci.yml @@ -79,14 +79,6 @@ jobs: # if: ${{ matrix.os != 'macOS-latest' }} # run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))" - #- name: Install r-universe build of SeuratObject (macOS) - # if: ${{ matrix.os == 'macOS-latest' }} - # run: cd apis/r && Rscript -e "install.packages('SeuratObject', repos = c('https://mojaveazure.r-universe.dev', 'https://cloud.r-project.org'))" - - #- name: Install r-universe build of SeuratObject (linux) - # if: ${{ matrix.os == 'ubuntu-latest' }} - # run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('SeuratObject', repos = c('https://mojaveazure.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))" - - name: Dependencies run: cd apis/r && tools/r-ci.sh install_all From 5450dfef7f0942e48514954ec46fd36f73ee0f5c Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 16:38:41 -0500 Subject: [PATCH 38/39] High-productivity afternoon --- apis/r/src/RcppExports.cpp | 6 +++--- apis/r/src/rinterface.cpp | 18 +++++++++--------- apis/r/src/riterator.cpp | 4 ++-- apis/r/src/rutilities.h | 4 ---- 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/apis/r/src/RcppExports.cpp b/apis/r/src/RcppExports.cpp index 0d3edad527..84cdc2ac19 100644 --- a/apis/r/src/RcppExports.cpp +++ b/apis/r/src/RcppExports.cpp @@ -12,7 +12,7 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // soma_array_reader -nanoarrowXPtr soma_array_reader(const std::string& uri, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, const std::string& loglevel, Rcpp::Nullable config); +SEXP soma_array_reader(const std::string& uri, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, const std::string& loglevel, Rcpp::Nullable config); RcppExport SEXP _tiledbsoma_soma_array_reader(SEXP uriSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP loglevelSEXP, SEXP configSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; @@ -130,7 +130,7 @@ BEGIN_RCPP END_RCPP } // create_empty_arrow_table -nanoarrowXPtr create_empty_arrow_table(); +SEXP create_empty_arrow_table(); RcppExport SEXP _tiledbsoma_create_empty_arrow_table() { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; @@ -140,7 +140,7 @@ BEGIN_RCPP END_RCPP } // sr_next -nanoarrowXPtr sr_next(Rcpp::XPtr sr); +SEXP sr_next(Rcpp::XPtr sr); RcppExport SEXP _tiledbsoma_sr_next(SEXP srSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index 4a82c16b86..4c58f7012b 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -49,15 +49,15 @@ Rcpp::XPtr array_setup_struct(Rcpp::XPtr arrxp, int64_t //' @noRd // [[Rcpp::export(soma_array_reader_impl)]] -nanoarrowXPtr soma_array_reader(const std::string& uri, - Rcpp::Nullable colnames = R_NilValue, - Rcpp::Nullable> qc = R_NilValue, - Rcpp::Nullable dim_points = R_NilValue, - Rcpp::Nullable dim_ranges = R_NilValue, - std::string batch_size = "auto", - std::string result_order = "auto", - const std::string& loglevel = "auto", - Rcpp::Nullable config = R_NilValue) { +SEXP soma_array_reader(const std::string& uri, + Rcpp::Nullable colnames = R_NilValue, + Rcpp::Nullable> qc = R_NilValue, + Rcpp::Nullable dim_points = R_NilValue, + Rcpp::Nullable dim_ranges = R_NilValue, + std::string batch_size = "auto", + std::string result_order = "auto", + const std::string& loglevel = "auto", + Rcpp::Nullable config = R_NilValue) { if (loglevel != "auto") { spdl::set_level(loglevel); diff --git a/apis/r/src/riterator.cpp b/apis/r/src/riterator.cpp index 5dcc38ad51..7521feb56a 100644 --- a/apis/r/src/riterator.cpp +++ b/apis/r/src/riterator.cpp @@ -168,7 +168,7 @@ bool sr_complete(Rcpp::XPtr sr) { //' @noRd //' @import nanoarrow // [[Rcpp::export]] -nanoarrowXPtr create_empty_arrow_table() { +SEXP create_empty_arrow_table() { int ncol = 0; // Schema first @@ -193,7 +193,7 @@ nanoarrowXPtr create_empty_arrow_table() { // [[Rcpp::export]] -nanoarrowXPtr sr_next(Rcpp::XPtr sr) { +SEXP sr_next(Rcpp::XPtr sr) { check_xptr_tag(sr); if (sr_complete(sr)) { diff --git a/apis/r/src/rutilities.h b/apis/r/src/rutilities.h index 7a5acbf586..f40ee755af 100644 --- a/apis/r/src/rutilities.h +++ b/apis/r/src/rutilities.h @@ -63,10 +63,6 @@ struct ContextWrapper { }; typedef struct ContextWrapper ctx_wrap_t; -// some lipstick on the (plain C language) pig that is a SEXP: -// allowing the nanoarrow ArrowArray XPtr be typedef'ed -typedef SEXP nanoarrowXPtr; - inline void exitIfError(const ArrowErrorCode ec, const std::string& msg) { if (ec != NANOARROW_OK) Rcpp::stop(msg); } From 7bb42fbe0ddb8d1977d237ec422d0644517f73a4 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 2 Apr 2024 18:39:32 -0500 Subject: [PATCH 39/39] Correct an format string error message --- libtiledbsoma/src/utils/arrow_adapter.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 7ecfc6dca8..66a87d863e 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -583,8 +583,8 @@ enum ArrowType ArrowAdapter::to_nanoarrow_type(std::string_view sv) { else if (sv == "Z") return NANOARROW_TYPE_LARGE_BINARY; else - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype string: {} ", sv)); + throw TileDBSOMAError( + fmt::format("ArrowAdapter: Unsupported Arrow format: {} ", sv)); } } // namespace tiledbsoma