From fbeee6f4c7484ad48eaa2b94d02f7fdb3103a46b Mon Sep 17 00:00:00 2001
From: Bruce Martin
Date: Thu, 2 Jan 2025 07:48:23 -0800
Subject: [PATCH] validate byteorder on argument arrays (#3508)

---
 apis/python/src/tiledbsoma/fastercsx.cc | 39 +++++++++++-
 apis/python/tests/test_libfastercsx.py  | 79 +++++++++++++++++++++++++
 2 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/apis/python/src/tiledbsoma/fastercsx.cc b/apis/python/src/tiledbsoma/fastercsx.cc
index 9eb5a9ced4..d82a20bef6 100644
--- a/apis/python/src/tiledbsoma/fastercsx.cc
+++ b/apis/python/src/tiledbsoma/fastercsx.cc
@@ -29,7 +29,7 @@
  *
  * Python bindings for CSX conversion primitives.
  */
-
+#include <bit>
 #include <span>
 
 // Define to include extra debugging bindings (e.g., count_rows)
@@ -108,6 +108,31 @@ std::span<T> make_mutable_casted_span_(py::array arr) {
     return std::span<T>(reinterpret_cast<T*>(p), arr.size());
 }
 
+/**
+ * @brief Return true if the NumPy byteorder code is native (or equivalent).
+ */
+bool is_native_byteorder(const char byteorder) {
+    if (byteorder == '=') // native
+        return true;
+    if (byteorder == '|') // not-applicable
+        return true;
+    if constexpr (std::endian::native == std::endian::big)
+        return byteorder == '>'; // big
+    else
+        return byteorder == '<'; // little
+}
+
+/**
+ * @brief Check that the dtype byteorder (endianness) is native, and raise an
+ * exception if not. Necessary because we dispatch on dtype().num(), which
+ * doesn't confirm that the byteorder is native.
+ */
+void check_byteorder(const py::dtype& dtype) {
+    if (!is_native_byteorder(dtype.byteorder()))
+        throw std::invalid_argument(
+            "All arrays must have native byteorder (endianness).");
+}
+
 /*
  * Value/data arrays are cast to an unsigned of the same width as the actual
  * value type. This is solely to reduce the combinatorics of template
@@ -209,6 +234,7 @@ T lookup_dtype_(
     const std::unordered_map<int, T>& index,
     const py::dtype& dtype,
     const std::string& array_name) {
+    check_byteorder(dtype);
     try {
         return index.at(dtype.num());
     } catch (const std::out_of_range& oor) {
@@ -241,6 +267,7 @@ void compress_coo_validate_args_(
     5. ensure B* are writeable
     6. Ensure each element in A* tuples are same type
     7. Ensure each element in the A* tuples are the same length
+    8. Ensure all arrays have native byteorder (endianness)
 
Not checked: @@ -261,6 +288,7 @@ void compress_coo_validate_args_( if (arr.dtype().num() != vec[0].dtype().num()) throw pybind11::type_error( "All chunks of COO arrays must be of same type."); + check_byteorder(arr.dtype()); } } for (uint64_t chunk_idx = 0; chunk_idx < n_chunks; chunk_idx++) { @@ -269,9 +297,14 @@ void compress_coo_validate_args_( throw std::length_error( "All COO array tuple elements must be of the same size."); } + if (Bp.ndim() != 1 || Bj.ndim() != 1 || Bd.ndim() != 1) throw std::length_error("All arrays must be of dimension rank 1."); + check_byteorder(Bp.dtype()); + check_byteorder(Bj.dtype()); + check_byteorder(Bd.dtype()); + for (auto& arr : Ad) if (arr.dtype().num() != Bd.dtype().num()) throw pybind11::type_error("All data arrays must be of same type."); @@ -408,6 +441,10 @@ bool sort_csx_indices( if (!Bp.writeable() || !Bj.writeable() || !Bd.writeable()) throw std::invalid_argument("Output arrays must be writeable."); + check_byteorder(Bp.dtype()); + check_byteorder(Bj.dtype()); + check_byteorder(Bd.dtype()); + // Get dispatch types CsxIndexType csx_major_index_type = lookup_dtype_( csx_index_type_dispatch, Bp.dtype(), "CSx indptr array"); diff --git a/apis/python/tests/test_libfastercsx.py b/apis/python/tests/test_libfastercsx.py index 6e949a9916..461e6bec7b 100644 --- a/apis/python/tests/test_libfastercsx.py +++ b/apis/python/tests/test_libfastercsx.py @@ -2,6 +2,7 @@ from __future__ import annotations +import sys from typing import Any import numpy as np @@ -11,6 +12,8 @@ import tiledbsoma.pytiledbsoma as clib import tiledbsoma.pytiledbsoma.fastercsx as fastercsx +NON_NATIVE_BYTEORDER = ">" if sys.byteorder == "little" else "<" + @pytest.fixture def concurrency() -> int | None: @@ -243,6 +246,14 @@ def test_sort_csx_indices_bad_args( pbad[1] = -1 fastercsx.sort_csx_indices(context, pbad, j, d) + # non-native byteorder should throw + with pytest.raises(ValueError): + fastercsx.sort_csx_indices(context, p.astype(f"{NON_NATIVE_BYTEORDER}i4"), j, d) + with pytest.raises(ValueError): + fastercsx.sort_csx_indices(context, p, j.astype(f"{NON_NATIVE_BYTEORDER}i4"), d) + with pytest.raises(ValueError): + fastercsx.sort_csx_indices(context, p, j, d.astype(f"{NON_NATIVE_BYTEORDER}i4")) + def test_compress_coo_bad_args( rng: np.random.Generator, context: clib.SOMAContext @@ -312,6 +323,74 @@ def test_compress_coo_bad_args( context, sp.shape, (i,), (j,), (d[1:],), indptr, indices, data ) + # non-native byteorder should throw + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i.astype(f"{NON_NATIVE_BYTEORDER}i4"),), + (j,), + (d,), + indptr, + indices, + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j.astype(f"{NON_NATIVE_BYTEORDER}i4"),), + (d,), + indptr, + indices, + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j,), + (d.astype(f"{NON_NATIVE_BYTEORDER}i4"),), + indptr, + indices, + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j,), + (d,), + indptr.astype(f"{NON_NATIVE_BYTEORDER}i4"), + indices, + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j,), + (d,), + indptr, + indices.astype(f"{NON_NATIVE_BYTEORDER}i4"), + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j,), + (d,), + indptr, + indices, + data.astype(f"{NON_NATIVE_BYTEORDER}i4"), + ) + def 
test_ragged_chunk_error( rng: np.random.Generator, context: clib.SOMAContext