Skip to content

Commit

Permalink
[python] Ingestion performance plus robust URI handling (#2457)
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl authored Apr 19, 2024
1 parent 4487347 commit c538f21
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 7 deletions.
8 changes: 7 additions & 1 deletion apis/python/src/tiledbsoma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,12 @@
from ._constants import SOMA_JOINID
from ._dataframe import DataFrame
from ._dense_nd_array import DenseNDArray
from ._exception import AlreadyExistsError, DoesNotExistError, SOMAError
from ._exception import (
AlreadyExistsError,
DoesNotExistError,
NotCreateableError,
SOMAError,
)
from ._experiment import Experiment
from ._factory import open
from ._general_utilities import (
Expand Down Expand Up @@ -192,6 +197,7 @@
"get_storage_engine",
"IntIndexer",
"Measurement",
"NotCreateableError",
"open",
"ResultOrder",
"show_package_versions",
Expand Down
6 changes: 6 additions & 0 deletions apis/python/src/tiledbsoma/_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,11 @@
from ._dense_nd_array import DenseNDArray
from ._exception import (
AlreadyExistsError,
NotCreateableError,
SOMAError,
is_already_exists_error,
is_does_not_exist_error,
is_not_createable_error,
)
from ._funcs import typeguard_ignore
from ._sparse_nd_array import SparseNDArray
Expand Down Expand Up @@ -119,6 +121,8 @@ def create(
Raises:
tiledbsoma.AlreadyExistsError:
If the underlying object already exists at the given URI.
tiledbsoma.NotCreateableError:
If the URI is malformed for a particular storage backend.
TileDBError:
If unable to create the underlying object.
Expand All @@ -137,6 +141,8 @@ def create(
except tiledb.TileDBError as tdbe:
if is_already_exists_error(tdbe):
raise AlreadyExistsError(f"{uri!r} already exists")
if is_not_createable_error(tdbe):
raise NotCreateableError(f"{uri!r} cannot be created")
raise

@classmethod
Expand Down
11 changes: 10 additions & 1 deletion apis/python/src/tiledbsoma/_common_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
import tiledb

from . import _arrow_types, _util
from ._exception import AlreadyExistsError, is_already_exists_error
from ._exception import (
AlreadyExistsError,
NotCreateableError,
is_already_exists_error,
is_not_createable_error,
)
from ._tiledb_array import TileDBArray
from ._types import OpenTimestamp
from .options._soma_tiledb_context import (
Expand Down Expand Up @@ -80,6 +85,8 @@ def create(
If the ``shape`` is unsupported.
tiledbsoma.AlreadyExistsError:
If the underlying object already exists at the given URI.
tiledbsoma.NotCreateableError:
If the URI is malformed for a particular storage backend.
TileDBError:
If unable to create the underlying object.
Expand All @@ -103,6 +110,8 @@ def create(
except tiledb.TileDBError as tdbe:
if is_already_exists_error(tdbe):
raise AlreadyExistsError(f"{uri!r} already exists")
if is_not_createable_error(tdbe):
raise NotCreateableError(f"{uri!r} cannot be created")
raise

@property
Expand Down
11 changes: 10 additions & 1 deletion apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
from . import _arrow_types, _util
from . import pytiledbsoma as clib
from ._constants import SOMA_JOINID
from ._exception import AlreadyExistsError, is_already_exists_error
from ._exception import (
AlreadyExistsError,
NotCreateableError,
is_already_exists_error,
is_not_createable_error,
)
from ._query_condition import QueryCondition
from ._read_iters import TableReadIter
from ._tdb_handles import DataFrameWrapper
Expand Down Expand Up @@ -190,6 +195,8 @@ def create(
If the ``schema`` specifies illegal column names.
tiledbsoma.AlreadyExistsError:
If the underlying object already exists at the given URI.
tiledbsoma.NotCreateableError:
If the URI is malformed for a particular storage backend.
TileDBError:
If unable to create the underlying object.
Expand Down Expand Up @@ -229,6 +236,8 @@ def create(
except tiledb.TileDBError as tdbe:
if is_already_exists_error(tdbe):
raise AlreadyExistsError(f"{uri!r} already exists")
if is_not_createable_error(tdbe):
raise NotCreateableError(f"{uri!r} cannot be created")
raise

def keys(self) -> Tuple[str, ...]:
Expand Down
45 changes: 45 additions & 0 deletions apis/python/src/tiledbsoma/_exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,51 @@ def is_already_exists_error(e: tiledb.TileDBError) -> bool:
return "already exists" in stre.lower()


class NotCreateableError(SOMAError):
    """Raised when a SOMA object cannot be created at the given URI.

    This is distinct from ``AlreadyExistsError``: it signals that the URI
    itself is not usable for creation on the storage backend (for example,
    a cloud URI that is not a "creation URI"), rather than that an object
    is already present there.

    Lifecycle: experimental
    """

    pass


def is_not_createable_error(e: tiledb.TileDBError) -> bool:
    """Report whether a TileDBError means the object cannot be created at its URI.

    The decision is made by a case-insensitive substring match on the
    error's message text.

    Lifecycle: experimental

    Example:
        try:
            tiledb.Array.create(uri, schema, ctx=ctx)
            ...
        except tiledb.TileDBError as e:
            if is_not_createable_error(e):
                ...
            raise e
    """
    # Why this check exists:
    # * tiledbsoma.io repeatedly needs "create if absent, otherwise open for
    #   write" (equivalently "open for write if present, otherwise create").
    # * Either ordering is acceptable a priori, but attempting the create
    #   first is the more performant choice: when the object _does_ already
    #   exist, we find that out quickly. Hence try-create-catch-open-for-write.
    # * Cloud URIs, however, have these semantics:
    #   o Writes require a "creation URI" of the form
    #     "tiledb://namespace/s3://bucket/some/path".
    #   o Reads accept a creation URI _or_ a non-creation URI of the form
    #     "tiledb://namespace/groupname" or "tiledb://namespace/uuid".
    # * Consequently, when try-create-catch-open-for-write is handed a
    #   non-creation URI, the create fails; callers must be able to detect
    #   that case and treat it as a non-error.
    message = str(e).lower()
    return any(
        marker in message
        for marker in (
            "storage backend local not supported",
            "storage backend not supported: local",
        )
    )


def is_duplicate_group_key_error(e: tiledb.TileDBError) -> bool:
"""Given a TileDBError, return try if it indicates a duplicate member
add request in a tiledb.Group.
Expand Down
9 changes: 5 additions & 4 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from .._exception import (
AlreadyExistsError,
DoesNotExistError,
NotCreateableError,
SOMAError,
)
from .._tdb_handles import RawHandle
Expand Down Expand Up @@ -989,7 +990,7 @@ def _create_or_open_collection(
) -> CollectionBase[_TDBO]:
try:
coll = cls.create(uri, context=context)
except AlreadyExistsError:
except (AlreadyExistsError, NotCreateableError):
# It already exists. Are we resuming?
if ingestion_params.error_if_already_exists:
raise SOMAError(f"{uri} already exists")
Expand Down Expand Up @@ -1200,7 +1201,7 @@ def _write_dataframe_impl(
platform_config=platform_config,
context=context,
)
except AlreadyExistsError:
except (AlreadyExistsError, NotCreateableError):
if ingestion_params.error_if_already_exists:
raise SOMAError(f"{soma_df.uri} already exists")

Expand Down Expand Up @@ -1301,7 +1302,7 @@ def _create_from_matrix(
platform_config=platform_config,
context=context,
)
except AlreadyExistsError:
except (AlreadyExistsError, NotCreateableError):
if ingestion_params.error_if_already_exists:
raise SOMAError(f"{soma_ndarray.uri} already exists")
soma_ndarray = cls.open(
Expand Down Expand Up @@ -2756,7 +2757,7 @@ def _ingest_uns_ndarray(
platform_config=platform_config,
context=context,
)
except AlreadyExistsError:
except (AlreadyExistsError, NotCreateableError):
soma_arr = DenseNDArray.open(arr_uri, "w", context=context)

# If resume mode: don't re-write existing data. This is the user's explicit request
Expand Down

0 comments on commit c538f21

Please sign in to comment.