From 27f52af3ddbb8224e1296bf18a0e1aba27e7692f Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 19 Apr 2024 14:58:24 -0400 Subject: [PATCH] [python] Ingestion performance plus robust URI handling (#2457) --- apis/python/src/tiledbsoma/__init__.py | 8 +++- apis/python/src/tiledbsoma/_collection.py | 6 +++ .../python/src/tiledbsoma/_common_nd_array.py | 11 ++++- apis/python/src/tiledbsoma/_dataframe.py | 11 ++++- apis/python/src/tiledbsoma/_exception.py | 45 +++++++++++++++++++ apis/python/src/tiledbsoma/io/ingest.py | 9 ++-- 6 files changed, 83 insertions(+), 7 deletions(-) diff --git a/apis/python/src/tiledbsoma/__init__.py b/apis/python/src/tiledbsoma/__init__.py index b33263f1c2..946f9cba40 100644 --- a/apis/python/src/tiledbsoma/__init__.py +++ b/apis/python/src/tiledbsoma/__init__.py @@ -147,7 +147,12 @@ from ._constants import SOMA_JOINID from ._dataframe import DataFrame from ._dense_nd_array import DenseNDArray -from ._exception import AlreadyExistsError, DoesNotExistError, SOMAError +from ._exception import ( + AlreadyExistsError, + DoesNotExistError, + NotCreateableError, + SOMAError, +) from ._experiment import Experiment from ._factory import open from ._general_utilities import ( @@ -192,6 +197,7 @@ "get_storage_engine", "IntIndexer", "Measurement", + "NotCreateableError", "open", "ResultOrder", "show_package_versions", diff --git a/apis/python/src/tiledbsoma/_collection.py b/apis/python/src/tiledbsoma/_collection.py index 922e2f9817..9e04c8f08f 100644 --- a/apis/python/src/tiledbsoma/_collection.py +++ b/apis/python/src/tiledbsoma/_collection.py @@ -39,9 +39,11 @@ from ._dense_nd_array import DenseNDArray from ._exception import ( AlreadyExistsError, + NotCreateableError, SOMAError, is_already_exists_error, is_does_not_exist_error, + is_not_createable_error, ) from ._funcs import typeguard_ignore from ._sparse_nd_array import SparseNDArray @@ -119,6 +121,8 @@ def create( Raises: tiledbsoma.AlreadyExistsError: If the underlying object already exists 
at the given URI. + tiledbsoma.NotCreateableError: + If the URI is malformed for a particular storage backend. TileDBError: If unable to create the underlying object. @@ -137,6 +141,8 @@ def create( except tiledb.TileDBError as tdbe: if is_already_exists_error(tdbe): raise AlreadyExistsError(f"{uri!r} already exists") + if is_not_createable_error(tdbe): + raise NotCreateableError(f"{uri!r} cannot be created") raise @classmethod diff --git a/apis/python/src/tiledbsoma/_common_nd_array.py b/apis/python/src/tiledbsoma/_common_nd_array.py index adc6e8c94a..ee2f4f83ff 100644 --- a/apis/python/src/tiledbsoma/_common_nd_array.py +++ b/apis/python/src/tiledbsoma/_common_nd_array.py @@ -16,7 +16,12 @@ import tiledb from . import _arrow_types, _util -from ._exception import AlreadyExistsError, is_already_exists_error +from ._exception import ( + AlreadyExistsError, + NotCreateableError, + is_already_exists_error, + is_not_createable_error, +) from ._tiledb_array import TileDBArray from ._types import OpenTimestamp from .options._soma_tiledb_context import ( @@ -80,6 +85,8 @@ def create( If the ``shape`` is unsupported. tiledbsoma.AlreadyExistsError: If the underlying object already exists at the given URI. + tiledbsoma.NotCreateableError: + If the URI is malformed for a particular storage backend. TileDBError: If unable to create the underlying object. @@ -103,6 +110,8 @@ def create( except tiledb.TileDBError as tdbe: if is_already_exists_error(tdbe): raise AlreadyExistsError(f"{uri!r} already exists") + if is_not_createable_error(tdbe): + raise NotCreateableError(f"{uri!r} cannot be created") raise @property diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 76c76cb691..a7aa482786 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -20,7 +20,12 @@ from . import _arrow_types, _util from . 
import pytiledbsoma as clib from ._constants import SOMA_JOINID -from ._exception import AlreadyExistsError, is_already_exists_error +from ._exception import ( + AlreadyExistsError, + NotCreateableError, + is_already_exists_error, + is_not_createable_error, +) from ._query_condition import QueryCondition from ._read_iters import TableReadIter from ._tdb_handles import DataFrameWrapper @@ -190,6 +195,8 @@ def create( If the ``schema`` specifies illegal column names. tiledbsoma.AlreadyExistsError: If the underlying object already exists at the given URI. + tiledbsoma.NotCreateableError: + If the URI is malformed for a particular storage backend. TileDBError: If unable to create the underlying object. @@ -229,6 +236,8 @@ def create( except tiledb.TileDBError as tdbe: if is_already_exists_error(tdbe): raise AlreadyExistsError(f"{uri!r} already exists") + if is_not_createable_error(tdbe): + raise NotCreateableError(f"{uri!r} cannot be created") raise def keys(self) -> Tuple[str, ...]: diff --git a/apis/python/src/tiledbsoma/_exception.py b/apis/python/src/tiledbsoma/_exception.py index 10a024855d..e1b60a9b52 100644 --- a/apis/python/src/tiledbsoma/_exception.py +++ b/apis/python/src/tiledbsoma/_exception.py @@ -85,6 +85,51 @@ def is_already_exists_error(e: tiledb.TileDBError) -> bool: return "already exists" in stre.lower() +class NotCreateableError(SOMAError): + """Raised when a SOMA object cannot be created at the given URI. + + Lifecycle: experimental + """ + + pass + + +def is_not_createable_error(e: tiledb.TileDBError) -> bool: + """Given a TileDBError, return true if it indicates the object cannot be created + + Lifecycle: experimental + + Example: + try: + tiledb.Array.create(uri, schema, ctx=ctx) + ... + except tiledb.TileDBError as e: + if is_not_createable_error(e): + ... 
+ raise e + """ + stre = str(e) + # Context: + # * A recurring paradigm in tiledbsoma.io is open for write (if exists) else create -- + # or, equivalently, create (if doesn't already exist), else open for write + # * A priori either seems fine + # * There are performance implications for trying the create first: when an + # object _does_ already exist we get that quickly. + # * Therefore it's more performant to try-create-catch-open-for-write + # * However we have the following semantics for cloud URIs: + # o For writes: must be "creation URIs" of the form "tiledb://namespace/s3://bucket/some/path" + # o For read: can be "creation URIs" _or_ non-creation URIs of the form + # "tiledb://namespace/groupname" or "tiledb://namespace/uuid" + # * Put together: when we try-create-catch-open-for-write, _and when_ the URI provided + # is a non-creation URI, we need to catch that fact and treat it as a non-error. + stre = stre.lower() + if "storage backend local not supported" in stre: + return True + if "storage backend not supported: local" in stre: + return True + return False + + def is_duplicate_group_key_error(e: tiledb.TileDBError) -> bool: """Given a TileDBError, return try if it indicates a duplicate member add request in a tiledb.Group. diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index d7aa4394e2..8fa653ec30 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -61,6 +61,7 @@ from .._exception import ( AlreadyExistsError, DoesNotExistError, + NotCreateableError, SOMAError, ) from .._tdb_handles import RawHandle @@ -989,7 +990,7 @@ def _create_or_open_collection( ) -> CollectionBase[_TDBO]: try: coll = cls.create(uri, context=context) - except AlreadyExistsError: + except (AlreadyExistsError, NotCreateableError): # It already exists. Are we resuming? 
if ingestion_params.error_if_already_exists: raise SOMAError(f"{uri} already exists") @@ -1200,7 +1201,7 @@ def _write_dataframe_impl( platform_config=platform_config, context=context, ) - except AlreadyExistsError: + except (AlreadyExistsError, NotCreateableError): if ingestion_params.error_if_already_exists: raise SOMAError(f"{soma_df.uri} already exists") @@ -1301,7 +1302,7 @@ def _create_from_matrix( platform_config=platform_config, context=context, ) - except AlreadyExistsError: + except (AlreadyExistsError, NotCreateableError): if ingestion_params.error_if_already_exists: raise SOMAError(f"{soma_ndarray.uri} already exists") soma_ndarray = cls.open( @@ -2756,7 +2757,7 @@ def _ingest_uns_ndarray( platform_config=platform_config, context=context, ) - except AlreadyExistsError: + except (AlreadyExistsError, NotCreateableError): soma_arr = DenseNDArray.open(arr_uri, "w", context=context) # If resume mode: don't re-write existing data. This is the user's explicit request