From 8f30bc054428692c47473adc1e93ce3aa362a8ca Mon Sep 17 00:00:00 2001 From: Nate Maytan Date: Tue, 16 Jul 2024 18:21:09 -0400 Subject: [PATCH 1/9] Accept dict for client 'write_dataframe' --- tiled/client/container.py | 2 ++ tiled/serialization/table.py | 5 ++++- tiled/structures/table.py | 9 +++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tiled/client/container.py b/tiled/client/container.py index 53719dfb8..c464e826d 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -942,6 +942,8 @@ def write_dataframe( if isinstance(dataframe, dask.dataframe.DataFrame): structure = TableStructure.from_dask_dataframe(dataframe) + elif isinstance(dataframe, dict): + structure = TableStructure.from_pydict(dataframe) else: structure = TableStructure.from_pandas(dataframe) client = self.new( diff --git a/tiled/serialization/table.py b/tiled/serialization/table.py index 999ce1aa7..b339e6b40 100644 --- a/tiled/serialization/table.py +++ b/tiled/serialization/table.py @@ -10,7 +10,10 @@ def serialize_arrow(df, metadata, preserve_index=True): import pyarrow - table = pyarrow.Table.from_pandas(df, preserve_index=preserve_index) + if isinstance(df, dict): + table = pyarrow.Table.from_pydict(df) + else: + table = pyarrow.Table.from_pandas(df, preserve_index=preserve_index) sink = pyarrow.BufferOutputStream() with pyarrow.ipc.new_file(sink, table.schema) as writer: writer.write_table(table) diff --git a/tiled/structures/table.py b/tiled/structures/table.py index 81a35d5c4..984ee44e6 100644 --- a/tiled/structures/table.py +++ b/tiled/structures/table.py @@ -47,6 +47,15 @@ def from_pandas(cls, df): data_uri = B64_ENCODED_PREFIX + schema_b64 return cls(arrow_schema=data_uri, npartitions=1, columns=list(df.columns)) + @classmethod + def from_pydict(cls, pd): + import pyarrow + + schema_bytes = pyarrow.Table.from_pydict(pd).schema.serialize() + schema_b64 = base64.b64encode(schema_bytes).decode("utf-8") + data_uri = B64_ENCODED_PREFIX + schema_b64 + return cls(arrow_schema=data_uri, npartitions=1, columns=list(pd.keys())) + @property def arrow_schema_decoded(self): import pyarrow From 6187f445a8041d9f8fcd66d11d44022411a7a412 Mon Sep 17 00:00:00 2001 From: Nate Maytan Date: Tue, 16 Jul 2024 18:41:15 -0400 Subject: [PATCH 2/9] Add test for writing dataframe from dict --- tiled/_tests/test_writing.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tiled/_tests/test_writing.py b/tiled/_tests/test_writing.py index 66cd8516b..4fbbfb7ed 100644 --- a/tiled/_tests/test_writing.py +++ b/tiled/_tests/test_writing.py @@ -174,6 +174,32 @@ def test_write_dataframe_partitioned(tree): assert result.specs == specs +# @pytest.mark.filterwarnings(f"ignore:{WARNING_PANDAS_BLOCKS}:DeprecationWarning") +def test_write_dataframe_dict(tree): + with Context.from_app( + build_app(tree, validation_registry=validation_registry) + ) as context: + client = from_context(context) + + data = {f"Column{i}": (1 + i) * numpy.ones(5) for i in range(5)} + df = pandas.DataFrame(data) + metadata = {"scan_id": 1, "method": "A"} + specs = [Spec("SomeSpec")] + + with record_history() as history: + client.write_dataframe(data, metadata=metadata, specs=specs) + # one request for metadata, one for data + assert len(history.requests) == 1 + 1 + + results = client.search(Key("scan_id") == 1) + result = results.values().first() + result_dataframe = result.read() + + pandas.testing.assert_frame_equal(result_dataframe, df) + assert result.metadata == metadata + assert result.specs == specs + + @pytest.mark.parametrize( "coo", [ From 44d6a932be0f0818eb6ec43f1a85c9fcffc33bc3 Mon Sep 17 00:00:00 2001 From: Nate Maytan Date: Wed, 17 Jul 2024 18:33:19 -0400 Subject: [PATCH 3/9] Better generic dict name in TableStructure --- tiled/structures/table.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tiled/structures/table.py b/tiled/structures/table.py index 984ee44e6..322380bd3 100644 --- a/tiled/structures/table.py +++ b/tiled/structures/table.py @@ -48,13 +48,13 @@ def from_pandas(cls, df): return cls(arrow_schema=data_uri, npartitions=1, columns=list(df.columns)) @classmethod - def from_pydict(cls, pd): + def from_pydict(cls, d): import pyarrow - schema_bytes = pyarrow.Table.from_pydict(pd).schema.serialize() + schema_bytes = pyarrow.Table.from_pydict(d).schema.serialize() schema_b64 = base64.b64encode(schema_bytes).decode("utf-8") data_uri = B64_ENCODED_PREFIX + schema_b64 - return cls(arrow_schema=data_uri, npartitions=1, columns=list(pd.keys())) + return cls(arrow_schema=data_uri, npartitions=1, columns=list(d.keys())) @property def arrow_schema_decoded(self): From efb355ff2ad6c3cbe790b2da2223fa07c4a58004 Mon Sep 17 00:00:00 2001 From: Nate Maytan Date: Wed, 17 Jul 2024 18:50:00 -0400 Subject: [PATCH 4/9] Add support for dict to TableAdapter --- tiled/adapters/table.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tiled/adapters/table.py b/tiled/adapters/table.py index b169ca05d..cf30cd3fd 100644 --- a/tiled/adapters/table.py +++ b/tiled/adapters/table.py @@ -57,6 +57,38 @@ def from_pandas( ddf, metadata=metadata, specs=specs, access_policy=access_policy ) + @classmethod + def from_pydict( + cls, + *args: Any, + metadata: Optional[JSON] = None, + specs: Optional[List[Spec]] = None, + access_policy: Optional[AccessPolicy] = None, + npartitions: int = 1, + **kwargs: Any, + ) -> "TableAdapter": + """ + + Parameters + ---------- + args : + metadata : + specs : + access_policy : + npartitions : + kwargs : + + Returns + ------- + + """ + ddf = dask.dataframe.from_dict(*args, npartitions=npartitions, **kwargs) + if specs is None: + specs = [Spec("dataframe")] + return cls.from_dask_dataframe( + ddf, metadata=metadata, specs=specs, access_policy=access_policy + ) + @classmethod def from_dask_dataframe( cls, From 20f0b08d974c2631fa7ec8f8246c5afac6076c9c Mon Sep 17 00:00:00 2001 From: Nate Maytan Date: Wed, 17 Jul 2024 19:21:43 -0400 Subject: [PATCH 5/9] Simplify generated_minimal example --- tiled/examples/generated_minimal.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tiled/examples/generated_minimal.py b/tiled/examples/generated_minimal.py index bee425dce..e44ab59e5 100644 --- a/tiled/examples/generated_minimal.py +++ b/tiled/examples/generated_minimal.py @@ -1,5 +1,4 @@ import numpy -import pandas import xarray from tiled.adapters.array import ArrayAdapter @@ -11,14 +10,12 @@ { "A": ArrayAdapter.from_array(numpy.ones((100, 100))), "B": ArrayAdapter.from_array(numpy.ones((100, 100, 100))), - "C": DataFrameAdapter.from_pandas( - pandas.DataFrame( - { - "x": 1 * numpy.ones(100), - "y": 2 * numpy.ones(100), - "z": 3 * numpy.ones(100), - } - ), + "C": DataFrameAdapter.from_pydict( + { + "x": 1 * numpy.ones(100), + "y": 2 * numpy.ones(100), + "z": 3 * numpy.ones(100), + }, npartitions=3, ), "D": DatasetAdapter.from_dataset( From f19f89fbd14df31e8390b4c04be0c242f76f56b5 Mon Sep 17 00:00:00 2001 From: Nate Maytan Date: Wed, 17 Jul 2024 19:49:50 -0400 Subject: [PATCH 6/9] Use newer TableAdapter name, rather than alias --- tiled/examples/generated_minimal.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tiled/examples/generated_minimal.py b/tiled/examples/generated_minimal.py index e44ab59e5..a114e3749 100644 --- a/tiled/examples/generated_minimal.py +++ b/tiled/examples/generated_minimal.py @@ -2,7 +2,7 @@ import xarray from tiled.adapters.array import ArrayAdapter -from tiled.adapters.dataframe import DataFrameAdapter +from tiled.adapters.dataframe import TableAdapter from tiled.adapters.mapping import MapAdapter from tiled.adapters.xarray import DatasetAdapter @@ -10,7 +10,7 @@ { "A": ArrayAdapter.from_array(numpy.ones((100, 100))), "B": ArrayAdapter.from_array(numpy.ones((100, 100, 100))), - "C": DataFrameAdapter.from_pydict( + "C": TableAdapter.from_pydict( { "x": 1 * numpy.ones(100), "y": 2 * numpy.ones(100), From a46a9687caa8a7271026bb9a419250456d36d6ff Mon Sep 17 00:00:00 2001 From: Nate Maytan Date: Wed, 17 Jul 2024 19:50:02 -0400 Subject: [PATCH 7/9] Update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d31c4d342..7ba645001 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ Write the date in place of the "Unreleased" in the case a new version is release ## Unreleased +### Added +- Add method to `TableAdapter` which accepts a Python dictionary. + +### Changed +- Make `tiled.client` accept a Python dictionary when fed to `write_dataframe()`. +- The `generated_minimal` example no longer requires pandas and instead uses a Python dict. + ### Fixed - A bug in `Context.__getstate__` caused picking to fail if applied twice. From 267790678e0ab0bb9c76136f594dcd7bd623d3fe Mon Sep 17 00:00:00 2001 From: Nate Maytan Date: Fri, 26 Jul 2024 13:57:23 -0400 Subject: [PATCH 8/9] Rename from_dict methods --- tiled/adapters/table.py | 2 +- tiled/client/container.py | 2 +- tiled/examples/generated_minimal.py | 2 +- tiled/structures/table.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tiled/adapters/table.py b/tiled/adapters/table.py index cf30cd3fd..4b7d3ff0c 100644 --- a/tiled/adapters/table.py +++ b/tiled/adapters/table.py @@ -58,7 +58,7 @@ def from_pandas( ) @classmethod - def from_pydict( + def from_dict( cls, *args: Any, metadata: Optional[JSON] = None, diff --git a/tiled/client/container.py b/tiled/client/container.py index c464e826d..7a87ce507 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -943,7 +943,7 @@ def write_dataframe( if isinstance(dataframe, dask.dataframe.DataFrame): structure = TableStructure.from_dask_dataframe(dataframe) elif isinstance(dataframe, dict): - structure = TableStructure.from_pydict(dataframe) + structure = TableStructure.from_dict(dataframe) else: structure = TableStructure.from_pandas(dataframe) client = self.new( diff --git a/tiled/examples/generated_minimal.py b/tiled/examples/generated_minimal.py index a114e3749..38774f1a5 100644 --- a/tiled/examples/generated_minimal.py +++ b/tiled/examples/generated_minimal.py @@ -10,7 +10,7 @@ { "A": ArrayAdapter.from_array(numpy.ones((100, 100))), "B": ArrayAdapter.from_array(numpy.ones((100, 100, 100))), - "C": TableAdapter.from_pydict( + "C": TableAdapter.from_dict( { "x": 1 * numpy.ones(100), "y": 2 * numpy.ones(100), diff --git a/tiled/structures/table.py b/tiled/structures/table.py index 322380bd3..8cf6de0f1 100644 --- a/tiled/structures/table.py +++ b/tiled/structures/table.py @@ -48,7 +48,7 @@ def from_pandas(cls, df): return cls(arrow_schema=data_uri, npartitions=1, columns=list(df.columns)) @classmethod - def from_pydict(cls, d): + def from_dict(cls, d): import pyarrow schema_bytes = pyarrow.Table.from_pydict(d).schema.serialize() From 7a19dd86393ef875d04b399ce0b1bf1b06a23f85 Mon Sep 17 00:00:00 2001 From: Nate Maytan Date: Fri, 26 Jul 2024 14:23:15 -0400 Subject: [PATCH 9/9] Remove commented ignore for Pandas warning from new test --- tiled/_tests/test_writing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tiled/_tests/test_writing.py b/tiled/_tests/test_writing.py index 4fbbfb7ed..61a0bf47f 100644 --- a/tiled/_tests/test_writing.py +++ b/tiled/_tests/test_writing.py @@ -174,7 +174,6 @@ def test_write_dataframe_partitioned(tree): assert result.specs == specs -# @pytest.mark.filterwarnings(f"ignore:{WARNING_PANDAS_BLOCKS}:DeprecationWarning") def test_write_dataframe_dict(tree): with Context.from_app( build_app(tree, validation_registry=validation_registry)