Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport release-1.8] [python] Implement IntIndexer as class that wraps around clib.IntIndexer #2327

Merged
merged 2 commits into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ repos:
- id: mypy
additional_dependencies:
- "pandas-stubs==1.5.3.230214"
- "somacore==1.0.9"
- "somacore==1.0.10"
- "types-setuptools==67.4.0.3"
args: ["--config-file=apis/python/pyproject.toml", "apis/python/src", "apis/python/devtools"]
pass_filenames: false
2 changes: 1 addition & 1 deletion apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def run(self):
"pyarrow>=9.0.0; platform_system!='Darwin'",
"scanpy>=1.9.2",
"scipy",
"somacore==1.0.9",
"somacore==1.0.10",
"tiledb~=0.26.0",
"typing-extensions", # Note "-" even though `import typing_extensions`
],
Expand Down
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@
get_storage_engine,
show_package_versions,
)
from ._index_util import tiledbsoma_build_index
from ._indexer import IntIndexer
from ._measurement import Measurement
from ._sparse_nd_array import SparseNDArray
from .options import SOMATileDBContext, TileDBCreateOptions
Expand All @@ -173,7 +173,6 @@
__all__ = [
"AxisColumnNames",
"AxisQuery",
"tiledbsoma_build_index",
"Collection",
"DataFrame",
"DenseNDArray",
Expand All @@ -184,6 +183,7 @@
"get_implementation",
"get_SOMA_version",
"get_storage_engine",
"IntIndexer",
"Measurement",
"open",
"ResultOrder",
Expand Down
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from ._collection import Collection, CollectionBase
from ._dataframe import DataFrame
from ._index_util import tiledbsoma_build_index
from ._indexer import IntIndexer
from ._measurement import Measurement
from ._tdb_handles import Wrapper
from ._tiledb_object import AnyTileDBObject
Expand Down Expand Up @@ -95,7 +95,7 @@ def axis_query( # type: ignore
obs_query=obs_query or query.AxisQuery(),
var_query=var_query or query.AxisQuery(),
index_factory=functools.partial(
tiledbsoma_build_index,
IntIndexer,
context=self.context,
),
)
52 changes: 0 additions & 52 deletions apis/python/src/tiledbsoma/_index_util.py

This file was deleted.

65 changes: 65 additions & 0 deletions apis/python/src/tiledbsoma/_indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, List, Optional, Union

import numpy as np
import numpy.typing as npt
import pandas as pd
import pyarrow as pa

from tiledbsoma import pytiledbsoma as clib

if TYPE_CHECKING:
from .options import SOMATileDBContext

Check warning on line 13 in apis/python/src/tiledbsoma/_indexer.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_indexer.py#L13

Added line #L13 was not covered by tests

# Input types accepted by ``IntIndexer``: used to annotate both the keys the
# index is built from and the lookup targets passed to ``get_indexer``.
IndexerDataType = Union[
npt.NDArray[np.int64],
pa.Array,
pa.IntegerArray,
pd.Series,
pd.arrays.IntegerArray,
pa.ChunkedArray,
List[int],
]


class IntIndexer:
    """Re-indexer over a collection of unique integer keys.

    Thin Python wrapper around the native ``clib.IntIndexer``.

    Lifecycle:
        Experimental.
    """

    def __init__(
        self, data: IndexerDataType, *, context: Optional["SOMATileDBContext"] = None
    ):
        """Initialize the re-indexer for the provided indices.

        Args:
            data:
                Integer keys used to build the index (hash) table.
            context:
                ``SOMATileDBContext`` object containing concurrency level.
                When omitted, the native indexer is constructed with
                ``None`` in place of a native context.

        Lifecycle:
            Experimental.
        """
        self._context = context

        # Only the native handle of the context is forwarded to the C++ layer.
        native_context = None if context is None else context.native_context

        self._reindexer = clib.IntIndexer(native_context)
        self._reindexer.map_locations(data)

    def get_indexer(self, target: IndexerDataType) -> Any:
        """Compute the underlying index positions for the target data.

        Compatible with Pandas' Index.get_indexer method.

        Args:
            target: Data to return re-index data for.

        Returns:
            The result of the native lookup for ``target``.
        """
        # Arrow inputs are dispatched to the dedicated pyarrow entry point;
        # every other input type goes through the general one.
        if isinstance(target, (pa.Array, pa.ChunkedArray)):
            return self._reindexer.get_indexer_pyarrow(target)
        return self._reindexer.get_indexer_general(target)
20 changes: 9 additions & 11 deletions apis/python/src/tiledbsoma/_read_iters.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

from . import _util
from ._exception import SOMAError
from ._index_util import tiledbsoma_build_index
from ._indexer import IntIndexer
from ._types import NTuple
from .options import SOMATileDBContext

Expand Down Expand Up @@ -138,10 +138,7 @@ def __init__(
self.axes_to_reindex = set(range(self.ndim)) - set(self.reindex_disable_on_axis)
assert context is not None
self.minor_axes_indexer = {
d: tiledbsoma_build_index(
self.joinids[d].to_numpy(),
context=context,
)
d: IntIndexer(self.joinids[d].to_numpy(), context=context)
for d in (self.axes_to_reindex - set((self.major_axis,)))
}

Expand Down Expand Up @@ -257,9 +254,8 @@ def _reindexed_table_reader(
if d in self.axes_to_reindex:
if d == self.major_axis:
assert self.context is not None
col = tiledbsoma_build_index(
coords[self.major_axis],
context=self.context,
col = IntIndexer(
coords[self.major_axis], context=self.context
).get_indexer(
col.to_numpy(),
)
Expand Down Expand Up @@ -337,9 +333,11 @@ def _create_reader(self) -> Iterator[BlockwiseScipyReadIterResult]:
"""
Private. Iterator over SparseNDArray producing sequence of scipy sparse matrix.
"""
yield from self._cs_reader(
_pool=self._threadpool
) if self.compress else self._coo_reader(_pool=self._threadpool)
yield from (
self._cs_reader(_pool=self._threadpool)
if self.compress
else self._coo_reader(_pool=self._threadpool)
)

def _sorted_tbl_reader(
self, _pool: Optional[ThreadPoolExecutor] = None
Expand Down
10 changes: 2 additions & 8 deletions apis/python/src/tiledbsoma/reindexer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -147,16 +147,10 @@ void load_reindexer(py::module& m) {
// Perform lookup for a large input array of keys and writes the
// looked up values into previously allocated array (works for the
// cases in which python and R pre-allocate the array)
.def(
"get_indexer",
[](IntIndexer& indexer, py::array_t<int64_t> lookups) {
return get_indexer_general(indexer, lookups);
})
.def("get_indexer_general", get_indexer_general)
// If the input is not arrow (does not have _export_to_c attribute),
// it will be handled using a general input method.
.def("get_indexer", [](IntIndexer& indexer, py::object py_arrow_array) {
return get_indexer_py_arrow(indexer, py_arrow_array);
});
.def("get_indexer_pyarrow", get_indexer_py_arrow);
}

} // namespace libtiledbsomacpp
6 changes: 3 additions & 3 deletions apis/python/tests/test_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pyarrow as pa
import pytest

from tiledbsoma._index_util import tiledbsoma_build_index
from tiledbsoma._indexer import IntIndexer
from tiledbsoma.options import SOMATileDBContext
from tiledbsoma.options._soma_tiledb_context import _validate_soma_tiledb_context

Expand All @@ -23,7 +23,7 @@ def test_duplicate_key_indexer_error(
):
context = _validate_soma_tiledb_context(SOMATileDBContext())
with pytest.raises(RuntimeError, match="There are duplicate keys."):
tiledbsoma_build_index(keys, context=context)
IntIndexer(keys, context=context)

pd_index = pd.Index(keys)
with pytest.raises(pd.errors.InvalidIndexError):
Expand Down Expand Up @@ -101,7 +101,7 @@ def test_indexer(contextual: bool, keys: np.array, lookups: np.array):
num_threads = 10

def target():
indexer = tiledbsoma_build_index(keys, context=context)
indexer = IntIndexer(keys, context=context)
results = indexer.get_indexer(lookups)
all_results.append(results)

Expand Down
20 changes: 7 additions & 13 deletions apis/python/tests/test_reindexer_api.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,16 @@
import numpy as np

from tiledbsoma import SOMATileDBContext, tiledbsoma_build_index
from typing import Optional

import numpy as np
import pytest

def test_reindexer_api_thread_count():
keys = np.arange(3, 10, 2)
ids = np.arange(3, 10, 2)
expected = np.array([0, 1, 2, 3])
indexer = tiledbsoma_build_index(keys)
result = indexer.get_indexer(ids)
assert np.equal(result.all(), expected.all())
from tiledbsoma import IntIndexer, SOMATileDBContext


def test_reindexer_api_context():
context = SOMATileDBContext()
@pytest.mark.parametrize("context", [None, SOMATileDBContext()])
def test_reindexer_api(context: Optional[SOMATileDBContext]):
    """Check that ``IntIndexer`` maps each key to its position, with and without a context.

    The keys ``[3, 5, 7, 9]`` are looked up against themselves, so the expected
    positions are ``[0, 1, 2, 3]``.
    """
    keys = np.arange(3, 10, 2)
    ids = np.arange(3, 10, 2)
    expected = np.array([0, 1, 2, 3])
    indexer = IntIndexer(keys, context=context)
    result = indexer.get_indexer(ids)
    # Compare the arrays element-wise. Comparing ``result.all()`` with
    # ``expected.all()`` only checks two scalar truthiness reductions, and
    # because ``expected`` contains 0 it would accept any result holding a zero.
    assert np.array_equal(result, expected)
2 changes: 1 addition & 1 deletion libtiledbsoma/src/reindexer/reindexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,4 @@ class IntIndexer {

} // namespace tiledbsoma

#endif // TILEDBSOMA_REINDEXER_H
#endif // TILEDBSOMA_REINDEXER_H
Loading