From c0a4d8b067f24a8a1a1a866698182ae4143eda69 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:01:19 -0700 Subject: [PATCH] Adding expansion concepts for storing data in a table --- docs/reference/oop/table_refactor.md | 3 + .../models/protocols/table_protocol.py | 44 +++++++- synapseclient/models/table.py | 100 +++++++++++++++++- 3 files changed, 140 insertions(+), 7 deletions(-) diff --git a/docs/reference/oop/table_refactor.md b/docs/reference/oop/table_refactor.md index febee410f..5cd14346f 100644 --- a/docs/reference/oop/table_refactor.md +++ b/docs/reference/oop/table_refactor.md @@ -22,6 +22,9 @@ client. - get_acl - set_permissions +::: synapseclient.models.table.SchemaStorageStrategy +::: synapseclient.models.table.ColumnExpansionStrategy + ::: synapseclient.models.FacetType ::: synapseclient.models.ColumnType ::: synapseclient.models.table.JsonSubColumn diff --git a/synapseclient/models/protocols/table_protocol.py b/synapseclient/models/protocols/table_protocol.py index 9dcda94f4..6c86cee39 100644 --- a/synapseclient/models/protocols/table_protocol.py +++ b/synapseclient/models/protocols/table_protocol.py @@ -9,7 +9,12 @@ from synapseclient import Synapse if TYPE_CHECKING: - from synapseclient.models.table import Row, Table + from synapseclient.models.table import ( + ColumnExpansionStrategy, + Row, + SchemaStorageStrategy, + Table, + ) class ColumnSynchronousProtocol(Protocol): @@ -54,6 +59,8 @@ def store( def store_rows( self, values: Union[str, List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], + schema_storage_strategy: "SchemaStorageStrategy" = None, + column_expansion_strategy: "ColumnExpansionStrategy" = None, *, synapse_client: Optional[Synapse] = None, ) -> None: @@ -68,6 +75,39 @@ def store_rows( - A dictionary where the key is the column name and the value is one or more values. 
The values will be wrapped into a [Pandas DataFrame](http://pandas.pydata.org/pandas-docs/stable/api.html#dataframe). - A [Pandas DataFrame](http://pandas.pydata.org/pandas-docs/stable/api.html#dataframe) + schema_storage_strategy: + (Default): SchemaStorageStrategy.INFER_FROM_DATA + + Determines how to automate the creation of columns + based on the data that is being stored. If you want to have full + control over the schema you may set this to `None` and create + the columns manually. + + The limitation with this behavior is that the columns created may only + be of the following types: + + - STRING + - LARGETEXT + - INTEGER + - DOUBLE + - BOOLEAN + - DATE + + The determination is based on how this pandas function infers the + data type: [infer_dtype](https://pandas.pydata.org/docs/reference/api/pandas.api.types.infer_dtype.html) + + column_expansion_strategy: + (Default): ColumnExpansionStrategy.AUTO_EXPAND_CONTENT_AND_LIST_LENGTH + + Determines how to automate the expansion of + columns based on the data that is being stored. The options given allow + cells with a limit on the length of content (Such as strings) or cells + with a limit on the number of values (Such as lists) to be expanded to + a larger size if the data being stored exceeds the limit. If you want to + have full control over the schema you may set this to `None` and create + the columns manually. + TODO: When implementing this feature, add more verbose documentation on exactly which column types may be expanded + synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
@@ -91,7 +131,7 @@ def delete_rows( Returns: None - # TODO: Add example of how to delete rows + TODO: Add example of how to delete rows """ return None diff --git a/synapseclient/models/table.py b/synapseclient/models/table.py index aadbd5eb0..f0eb5d383 100644 --- a/synapseclient/models/table.py +++ b/synapseclient/models/table.py @@ -303,6 +303,63 @@ async def store_async( return self +class SchemaStorageStrategy(str, Enum): + """Enum used to determine how columns are created from the data being stored.""" + + INFER_FROM_DATA = "INFER_FROM_DATA" + """ + (Default) + Allow the data to define which columns are created on the Synapse table + automatically. The limitation with this behavior is that the columns created may + only be of the following types: + + - STRING + - LARGETEXT + - INTEGER + - DOUBLE + - BOOLEAN + - DATE + + The determination of the column type is based on the data that is passed in + using the pandas function + [infer_dtype](https://pandas.pydata.org/docs/reference/api/pandas.api.types.infer_dtype.html). + If you need a more specific column type, or need to add options to the columns + follow the examples shown in the [Table][synapseclient.models.Table] class. + + + The columns created as a result of this strategy will be appended to the end of the + existing columns if the table already exists. + """ + + +class ColumnExpansionStrategy(str, Enum): + """ + Determines how to automate the expansion of columns based on the data + that is being stored. The options given allow cells with a limit on the length of + content (Such as strings) or cells with a limit on the number of values (Such as + lists) to be expanded to a larger size if the data being stored exceeds the limit. + """ + + AUTO_EXPAND_CONTENT_AND_LIST_LENGTH = "AUTO_EXPAND_CONTENT_AND_LIST_LENGTH" + """ + (Default) + Automatically expand both the content length and list length of columns if the data + being stored exceeds the limit.
+ """ + + AUTO_EXPAND_CONTENT_LENGTH = "AUTO_EXPAND_CONTENT_LENGTH" + """ + Automatically expand the content length of columns if the data being stored exceeds + the limit. + """ + + AUTO_EXPAND_LIST_LENGTH = "AUTO_EXPAND_LIST_LENGTH" + """ + Automatically expand the list length of columns if the data being stored exceeds + the limit. + """ + + @dataclass() @async_to_sync class Table(TableSynchronousProtocol, AccessControllable): @@ -805,6 +862,8 @@ def fill_from_dict( async def store_rows_async( self, values: Union[str, List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], + schema_storage_strategy: SchemaStorageStrategy = SchemaStorageStrategy.INFER_FROM_DATA, + column_expansion_strategy: ColumnExpansionStrategy = ColumnExpansionStrategy.AUTO_EXPAND_CONTENT_AND_LIST_LENGTH, *, synapse_client: Optional[Synapse] = None, ) -> None: @@ -819,6 +878,34 @@ async def store_rows_async( - A dictionary where the key is the column name and the value is one or more values. The values will be wrapped into a [Pandas DataFrame](http://pandas.pydata.org/pandas-docs/stable/api.html#dataframe). - A [Pandas DataFrame](http://pandas.pydata.org/pandas-docs/stable/api.html#dataframe) + schema_storage_strategy: Determines how to automate the creation of columns + based on the data that is being stored. If you want to have full + control over the schema you may set this to `None` and create + the columns manually. + + The limitation with this behavior is that the columns created may only + be of the following types: + + - STRING + - LARGETEXT + - INTEGER + - DOUBLE + - BOOLEAN + - DATE + + The determination is based on how this pandas function infers the + data type: [infer_dtype](https://pandas.pydata.org/docs/reference/api/pandas.api.types.infer_dtype.html) + + column_expansion_strategy: Determines how to automate the expansion of + columns based on the data that is being stored. 
The options given allow + cells with a limit on the length of content (Such as strings) or cells + with a limit on the number of values (Such as lists) to be expanded to + a larger size if the data being stored exceeds the limit. If you want to + have full control over the schema you may set this to `None` and create + the columns manually. + + TODO: When implementing this feature, add more verbose documentation on exactly which column types may be expanded + synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor. @@ -827,10 +914,13 @@ async def store_rows_async( None """ client = Synapse.get_client(synapse_client=synapse_client) - client.logger.info( - f"Checking for changes to the schema of table: {self.name or self.id}" - ) - await self.store_async(synapse_client=synapse_client) + + # TODO: Implement logic to handle schema changes and expansion of columns + if schema_storage_strategy == SchemaStorageStrategy.INFER_FROM_DATA: + client.logger.info( + f"Checking for changes to the schema of table: {self.name or self.id}" + ) + await self.store_async(synapse_client=synapse_client) client.logger.info(f"Storing rows for table {self.name or self.id}") @@ -880,7 +970,7 @@ async def delete_rows_async( Returns: None - # TODO: Add example of how to delete rows + TODO: Add example of how to delete rows """ rows_to_delete = [] for row in rows: