
Commit 6637b82
WIP fix several errors for macos
nguyenv committed Apr 5, 2024
1 parent fc35aba commit 6637b82
Showing 4 changed files with 7 additions and 122 deletions.
apis/python/devtools/ingestor (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,6 @@ import os
import sys
from typing import Optional

import tiledb
from somacore import options

import tiledbsoma
@@ -26,6 +25,7 @@ import tiledbsoma._util
import tiledbsoma.io
import tiledbsoma.logging
from tiledbsoma.options import SOMATileDBContext
import tiledb


# ================================================================
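In the ingestor devtool the only change is import ordering: import tiledb moves from earlier in the import list to after the tiledbsoma imports. A minimal sketch of the resulting import block, abbreviated to the lines shown in the diff (the rationale, presumably that tiledbsoma and its bundled native library should be loaded before the standalone tiledb module on macOS, is an assumption and not stated in the commit):

    # Sketch of the reordered imports in apis/python/devtools/ingestor (abbreviated).
    import tiledbsoma
    import tiledbsoma.io
    import tiledbsoma.logging
    from tiledbsoma.options import SOMATileDBContext

    import tiledb  # now imported after the tiledbsoma imports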
apis/python/src/tiledbsoma/_dataframe.py (119 changes: 2 additions & 117 deletions)
@@ -12,7 +12,6 @@
import pandas as pd
import pyarrow as pa
import somacore
import tiledb
from somacore import options
from typing_extensions import Self

@@ -506,7 +505,7 @@ def write(
return self

def _set_reader_coord(
self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object
self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Field, coord: object
) -> bool:
if coord is None:
return True # No constraint; select all in this dimension
@@ -580,7 +579,7 @@ def _set_reader_coord_by_py_seq_or_np_array(
self,
sr: clib.SOMAArray,
dim_idx: int,
dim: tiledb.Dim,
dim: pa.Field,
coord: object,
) -> bool:
if isinstance(coord, np.ndarray):
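Both reader-coordinate helpers now describe the dimension with a pa.Field instead of a tiledb.Dim, so the dimension's name and value type come from the Arrow schema rather than from tiledb-py. A hypothetical sketch of how a caller might obtain such a field; the helper name field_for_dim and its arguments are illustrative and not part of this diff:

    from typing import Sequence

    import pyarrow as pa

    def field_for_dim(
        schema: pa.Schema, index_column_names: Sequence[str], dim_idx: int
    ) -> pa.Field:
        # Illustrative only: look up the Arrow field describing an index column.
        # A pa.Field like this now plays the role the tiledb.Dim argument used to.
        return schema.field(index_column_names[dim_idx])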
@@ -705,120 +704,6 @@ def _canonicalize_schema(
return schema


def _build_tiledb_schema(
schema: pa.Schema,
index_column_names: Sequence[str],
domain: Optional[Sequence[Optional[Tuple[Any, Any]]]],
tiledb_create_options: TileDBCreateOptions,
context: SOMATileDBContext,
) -> tiledb.ArraySchema:
"""Converts an Arrow schema into a TileDB ArraySchema for creation."""

if domain is None:
domain = tuple(None for _ in index_column_names)
else:
ndom = len(domain)
nidx = len(index_column_names)
if ndom != nidx:
raise ValueError(
f"if domain is specified, it must have the same length as index_column_names; got {ndom} != {nidx}"
)

dims = []
for index_column_name, slot_domain in zip(index_column_names, domain):
pa_type = schema.field(index_column_name).type
dtype = _arrow_types.tiledb_type_from_arrow_type(
pa_type, is_indexed_column=True
)

slot_domain = _fill_out_slot_domain(
slot_domain, index_column_name, pa_type, dtype
)

extent = _find_extent_for_domain(
index_column_name, tiledb_create_options, dtype, slot_domain
)

dim = tiledb.Dim(
name=index_column_name,
domain=slot_domain,
tile=extent,
dtype=dtype,
filters=tiledb_create_options.dim_filters_tiledb(
index_column_name,
[
dict(
_type="ZstdFilter",
level=tiledb_create_options.dataframe_dim_zstd_level,
)
],
),
)
dims.append(dim)

dom = tiledb.Domain(dims, ctx=context.tiledb_ctx)

attrs = []
enums = []
metadata = schema.metadata or {}
for pa_attr in schema:
attr_name = pa_attr.name

if attr_name in index_column_names:
continue

has_enum = pa.types.is_dictionary(pa_attr.type)

if has_enum:
enmr_dtype: np.dtype[Any]
vtype = pa_attr.type.value_type
if pa.types.is_large_string(vtype) or pa.types.is_string(vtype):
enmr_dtype = np.dtype("U")
elif pa.types.is_large_binary(vtype) or pa.types.is_binary(vtype):
enmr_dtype = np.dtype("S")
else:
enmr_dtype = np.dtype(vtype.to_pandas_dtype())
enums.append(
tiledb.Enumeration(
name=attr_name,
ordered=pa_attr.type.ordered,
dtype=enmr_dtype,
)
)

attr = tiledb.Attr(
name=attr_name,
dtype=_arrow_types.tiledb_type_from_arrow_type(
schema.field(attr_name).type
),
nullable=metadata.get(attr_name.encode("utf-8")) == b"nullable",
filters=tiledb_create_options.attr_filters_tiledb(
attr_name, ["ZstdFilter"]
),
enum_label=attr_name if has_enum else None,
ctx=context.tiledb_ctx,
)
attrs.append(attr)

cell_order, tile_order = tiledb_create_options.cell_tile_orders()

return tiledb.ArraySchema(
domain=dom,
attrs=attrs,
enums=enums,
sparse=True,
allows_duplicates=tiledb_create_options.allows_duplicates,
offsets_filters=tiledb_create_options.offsets_filters_tiledb(),
validity_filters=tiledb_create_options.validity_filters_tiledb(),
capacity=tiledb_create_options.capacity,
cell_order=cell_order,
# As of TileDB core 2.8.2, we cannot consolidate string-indexed sparse arrays with
# col-major tile order: so we write ``X`` with row-major tile order.
tile_order=tile_order,
ctx=context.tiledb_ctx,
)
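The entire _build_tiledb_schema helper above is deleted, so dataframe creation no longer assembles a tiledb-py ArraySchema on the Python side (presumably the schema is now built from the Arrow schema in the C++ layer; that is an inference from the diff, not stated in the commit). For reference, a self-contained sketch of the enumeration-dtype mapping the removed code applied to dictionary-encoded columns:

    import numpy as np
    import pyarrow as pa

    def enum_value_dtype(field: pa.Field) -> np.dtype:
        # Mirrors the logic from the removed _build_tiledb_schema: map a
        # dictionary-encoded (categorical) Arrow column to the NumPy dtype
        # used for its enumeration values.
        vtype = field.type.value_type
        if pa.types.is_large_string(vtype) or pa.types.is_string(vtype):
            return np.dtype("U")
        if pa.types.is_large_binary(vtype) or pa.types.is_binary(vtype):
            return np.dtype("S")
        return np.dtype(vtype.to_pandas_dtype())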


def _fill_out_slot_domain(
slot_domain: Optional[Tuple[Any, Any]],
index_column_name: str,
apis/python/tests/test_dataframe.py (2 changes: 1 addition & 1 deletion)
@@ -1361,7 +1361,7 @@ def test_enum_extend_past_numerical_limit(tmp_path):

# cannot add additional categories as already maxed out earlier
tbl = pa.Table.from_pandas(df2, preserve_index=False)
with pytest.raises(soma.SOMAError):
with pytest.raises((RuntimeError, soma.SOMAError)):
with soma.open(uri, mode="w") as A:
A.write(tbl)
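The test now accepts either exception type; apparently the failure surfaces on macOS as a plain RuntimeError from the native layer rather than as soma.SOMAError (an inference from the change, not stated in the commit). A minimal, standalone illustration of passing a tuple of exception types to pytest.raises:

    import pytest

    def write_too_many_categories() -> None:
        # Stand-in for the write that exceeds the enumeration limit.
        raise RuntimeError("enumeration value limit exceeded")

    def test_accepts_either_error() -> None:
        # pytest.raises accepts a tuple when more than one exception type is acceptable.
        with pytest.raises((RuntimeError, ValueError)):
            write_too_many_categories()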

libtiledbsoma/src/utils/arrow_adapter.cc (6 changes: 3 additions & 3 deletions)
@@ -505,8 +505,8 @@ ArrowTable ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
}

bool ArrowAdapter::_isvar(const char* format) {
if ((strcmp(format, "U") == 0) | (strcmp(format, "Z") == 0) |
(strcmp(format, "u") == 0) | (strcmp(format, "z") == 0)) {
if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) {
return true;
}
return false;
@@ -560,4 +560,4 @@ tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) {
}
}

} // namespace tiledbsoma
} // namespace tiledbsoma
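On the C++ side, the bitwise OR operators in ArrowAdapter::_isvar are replaced with logical ||. For these boolean strcmp comparisons the result is the same, but || short-circuits and is the idiomatic operator here; plausibly the change also silences a clang diagnostic such as -Wbitwise-instead-of-logical that the macOS build treats as an error (an assumption, not stated in the commit). The repeated closing-brace line in the final hunk is presumably just a fix for a missing trailing newline at the end of the file.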
