diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 385d3f0d8f..e05d34ec41 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: - id: mypy additional_dependencies: - "pandas-stubs==1.5.3.230214" - - "somacore==1.0.9" + - "somacore==1.0.10" - "types-setuptools==67.4.0.3" args: ["--config-file=apis/python/pyproject.toml", "apis/python/src", "apis/python/devtools"] pass_filenames: false diff --git a/apis/python/setup.py b/apis/python/setup.py index ef2fccc914..1a7eb41294 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -327,7 +327,7 @@ def run(self): "pyarrow>=9.0.0; platform_system!='Darwin'", "scanpy>=1.9.2", "scipy", - "somacore==1.0.9", + "somacore==1.0.10", "tiledb~=0.26.0", "typing-extensions", # Note "-" even though `import typing_extensions` ], diff --git a/apis/python/src/tiledbsoma/__init__.py b/apis/python/src/tiledbsoma/__init__.py index 0b375edd83..6e11dea753 100644 --- a/apis/python/src/tiledbsoma/__init__.py +++ b/apis/python/src/tiledbsoma/__init__.py @@ -157,7 +157,7 @@ get_storage_engine, show_package_versions, ) -from ._index_util import tiledbsoma_build_index +from ._indexer import IntIndexer from ._measurement import Measurement from ._sparse_nd_array import SparseNDArray from .options import SOMATileDBContext, TileDBCreateOptions @@ -173,7 +173,6 @@ __all__ = [ "AxisColumnNames", "AxisQuery", - "tiledbsoma_build_index", "Collection", "DataFrame", "DenseNDArray", @@ -184,6 +183,7 @@ "get_implementation", "get_SOMA_version", "get_storage_engine", + "IntIndexer", "Measurement", "open", "ResultOrder", diff --git a/apis/python/src/tiledbsoma/_experiment.py b/apis/python/src/tiledbsoma/_experiment.py index 3942dfb4d3..07f9ce0346 100644 --- a/apis/python/src/tiledbsoma/_experiment.py +++ b/apis/python/src/tiledbsoma/_experiment.py @@ -13,7 +13,7 @@ from ._collection import Collection, CollectionBase from ._dataframe import DataFrame -from ._index_util import tiledbsoma_build_index 
+from ._indexer import IntIndexer from ._measurement import Measurement from ._tdb_handles import Wrapper from ._tiledb_object import AnyTileDBObject @@ -95,7 +95,7 @@ def axis_query( # type: ignore obs_query=obs_query or query.AxisQuery(), var_query=var_query or query.AxisQuery(), index_factory=functools.partial( - tiledbsoma_build_index, + IntIndexer, context=self.context, ), ) diff --git a/apis/python/src/tiledbsoma/_index_util.py b/apis/python/src/tiledbsoma/_index_util.py deleted file mode 100644 index 74c9be327f..0000000000 --- a/apis/python/src/tiledbsoma/_index_util.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -This file is separate from _util.py, due to a circular-import issue with -SOMATileDBContext which would otherwise ensue. -""" -from __future__ import annotations - -from typing import TYPE_CHECKING, Optional, Union - -import numpy as np -import pandas as pd -import pyarrow as pa -from somacore.query.types import IndexLike - -from tiledbsoma import pytiledbsoma as clib - -if TYPE_CHECKING: - from .options import SOMATileDBContext - - -def tiledbsoma_build_index( - keys: Union[ # type: ignore[type-arg] - np.typing.NDArray[np.int64], - pa.Array, - pa.IntegerArray, - pd.Series, - pd.arrays.IntegerArray, - pa.ChunkedArray, - list[int], - ], - *, - context: Optional["SOMATileDBContext"] = None, - thread_count: int = 4, -) -> IndexLike: - """ - Returns an ``IndexLike`` re-indexer. - The reindexer has an API similar to :meth:`pd.Index.get_indexer` - - Args: - keys: - Integer keys used to build the index (hash) table. - context: - ``SOMATileDBContext`` object containing concurrecy level (exclusive with thread_count). - thread_count: - Concurrency level when the user does not want to use ``context``. - - Lifecycle: - Experimental. 
- """ - native_context = None if context is None else context.native_context - reindexer = clib.IntIndexer(native_context) - reindexer.map_locations(keys) - return reindexer # type: ignore[no-any-return] diff --git a/apis/python/src/tiledbsoma/_indexer.py b/apis/python/src/tiledbsoma/_indexer.py new file mode 100644 index 0000000000..f2e904d999 --- /dev/null +++ b/apis/python/src/tiledbsoma/_indexer.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, List, Optional, Union + +import numpy as np +import numpy.typing as npt +import pandas as pd +import pyarrow as pa + +from tiledbsoma import pytiledbsoma as clib + +if TYPE_CHECKING: + from .options import SOMATileDBContext + +IndexerDataType = Union[ + npt.NDArray[np.int64], + pa.Array, + pa.IntegerArray, + pd.Series, + pd.arrays.IntegerArray, + pa.ChunkedArray, + List[int], +] + + +class IntIndexer: + """A re-indexer for unique integer indices. + + Lifecycle: + Experimental. + """ + + def __init__( + self, data: IndexerDataType, *, context: Optional["SOMATileDBContext"] = None + ): + """Initialize re-indexer for provided indices. + + Args: + data: + Integer keys used to build the index (hash) table. + context: + ``SOMATileDBContext`` object containing concurrency level. + + Lifecycle: + Experimental. + """ + self._context = context + self._reindexer = clib.IntIndexer( + None if self._context is None else self._context.native_context + ) + self._reindexer.map_locations(data) + + def get_indexer(self, target: IndexerDataType) -> Any: + """Compute underlying indices of index for target data. + + Compatible with Pandas' Index.get_indexer method. + + Args: + target: Data to return re-index data for. 
+ """ + return ( + self._reindexer.get_indexer_pyarrow(target) + if isinstance(target, (pa.Array, pa.ChunkedArray)) + else self._reindexer.get_indexer_general(target) + ) diff --git a/apis/python/src/tiledbsoma/_read_iters.py b/apis/python/src/tiledbsoma/_read_iters.py index a0c7368d7d..878d14adf0 100644 --- a/apis/python/src/tiledbsoma/_read_iters.py +++ b/apis/python/src/tiledbsoma/_read_iters.py @@ -36,7 +36,7 @@ from . import _util from ._exception import SOMAError -from ._index_util import tiledbsoma_build_index +from ._indexer import IntIndexer from ._types import NTuple from .options import SOMATileDBContext @@ -138,10 +138,7 @@ def __init__( self.axes_to_reindex = set(range(self.ndim)) - set(self.reindex_disable_on_axis) assert context is not None self.minor_axes_indexer = { - d: tiledbsoma_build_index( - self.joinids[d].to_numpy(), - context=context, - ) + d: IntIndexer(self.joinids[d].to_numpy(), context=context) for d in (self.axes_to_reindex - set((self.major_axis,))) } @@ -257,9 +254,8 @@ def _reindexed_table_reader( if d in self.axes_to_reindex: if d == self.major_axis: assert self.context is not None - col = tiledbsoma_build_index( - coords[self.major_axis], - context=self.context, + col = IntIndexer( + coords[self.major_axis], context=self.context ).get_indexer( col.to_numpy(), ) @@ -337,9 +333,11 @@ def _create_reader(self) -> Iterator[BlockwiseScipyReadIterResult]: """ Private. Iterator over SparseNDArray producing sequence of scipy sparse matrix. 
""" - yield from self._cs_reader( - _pool=self._threadpool - ) if self.compress else self._coo_reader(_pool=self._threadpool) + yield from ( + self._cs_reader(_pool=self._threadpool) + if self.compress + else self._coo_reader(_pool=self._threadpool) + ) def _sorted_tbl_reader( self, _pool: Optional[ThreadPoolExecutor] = None diff --git a/apis/python/src/tiledbsoma/reindexer.cc b/apis/python/src/tiledbsoma/reindexer.cc index 0e9b2b2da1..025325a73e 100644 --- a/apis/python/src/tiledbsoma/reindexer.cc +++ b/apis/python/src/tiledbsoma/reindexer.cc @@ -147,16 +147,10 @@ void load_reindexer(py::module& m) { // Perform lookup for a large input array of keys and writes the // looked up values into previously allocated array (works for the // cases in which python and R pre-allocate the array) - .def( - "get_indexer", - [](IntIndexer& indexer, py::array_t lookups) { - return get_indexer_general(indexer, lookups); - }) + .def("get_indexer_general", get_indexer_general) // If the input is not arrow (does not have _export_to_c attribute), // it will be handled using a general input method. 
- .def("get_indexer", [](IntIndexer& indexer, py::object py_arrow_array) { - return get_indexer_py_arrow(indexer, py_arrow_array); - }); + .def("get_indexer_pyarrow", get_indexer_py_arrow); } } // namespace libtiledbsomacpp diff --git a/apis/python/tests/test_indexer.py b/apis/python/tests/test_indexer.py index fa3908e673..b182bc66ac 100644 --- a/apis/python/tests/test_indexer.py +++ b/apis/python/tests/test_indexer.py @@ -6,7 +6,7 @@ import pyarrow as pa import pytest -from tiledbsoma._index_util import tiledbsoma_build_index +from tiledbsoma._indexer import IntIndexer from tiledbsoma.options import SOMATileDBContext from tiledbsoma.options._soma_tiledb_context import _validate_soma_tiledb_context @@ -23,7 +23,7 @@ def test_duplicate_key_indexer_error( ): context = _validate_soma_tiledb_context(SOMATileDBContext()) with pytest.raises(RuntimeError, match="There are duplicate keys."): - tiledbsoma_build_index(keys, context=context) + IntIndexer(keys, context=context) pd_index = pd.Index(keys) with pytest.raises(pd.errors.InvalidIndexError): @@ -101,7 +101,7 @@ def test_indexer(contextual: bool, keys: np.array, lookups: np.array): num_threads = 10 def target(): - indexer = tiledbsoma_build_index(keys, context=context) + indexer = IntIndexer(keys, context=context) results = indexer.get_indexer(lookups) all_results.append(results) diff --git a/apis/python/tests/test_reindexer_api.py b/apis/python/tests/test_reindexer_api.py index 9f24fb113f..1e876397ff 100644 --- a/apis/python/tests/test_reindexer_api.py +++ b/apis/python/tests/test_reindexer_api.py @@ -1,22 +1,16 @@ -import numpy as np - -from tiledbsoma import SOMATileDBContext, tiledbsoma_build_index +from typing import Optional +import numpy as np +import pytest -def test_reindexer_api_thread_count(): - keys = np.arange(3, 10, 2) - ids = np.arange(3, 10, 2) - expected = np.array([0, 1, 2, 3]) - indexer = tiledbsoma_build_index(keys) - result = indexer.get_indexer(ids) - assert np.equal(result.all(), expected.all()) 
+from tiledbsoma import IntIndexer, SOMATileDBContext -def test_reindexer_api_context(): - context = SOMATileDBContext() +@pytest.mark.parametrize("context", [None, SOMATileDBContext()]) +def test_reindexer_api(context: Optional[SOMATileDBContext]): keys = np.arange(3, 10, 2) ids = np.arange(3, 10, 2) expected = np.array([0, 1, 2, 3]) - indexer = tiledbsoma_build_index(keys, context=context) + indexer = IntIndexer(keys, context=context) result = indexer.get_indexer(ids) assert np.equal(result.all(), expected.all()) diff --git a/libtiledbsoma/src/reindexer/reindexer.h b/libtiledbsoma/src/reindexer/reindexer.h index 5b473a0363..918c34e21e 100644 --- a/libtiledbsoma/src/reindexer/reindexer.h +++ b/libtiledbsoma/src/reindexer/reindexer.h @@ -94,4 +94,4 @@ class IntIndexer { } // namespace tiledbsoma -#endif // TILEDBSOMA_REINDEXER_H \ No newline at end of file +#endif // TILEDBSOMA_REINDEXER_H