You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Issue and/or context: As found by @bkmartinjr in Slack.
Here is a repro script to write a SOMADataFrame with attributes of various non-enum and enum types:
cat ./2305.py
#!/usr/bin/env python
import numpy as np
import pandas as pd
import pyarrow as pa
import tiledbsoma as soma
import os, shutil
def main():
    """Round-trip a small pandas DataFrame through a SOMA DataFrame.

    Builds a frame with plain and categorical int/str/bytes columns, prints
    the pandas and Arrow views of it, writes it to ``./test_dataframe``,
    reads it back, and asserts that every column dtype (and, for categorical
    columns, the categories dtype) matches the original.
    """
    fname = "./test_dataframe"
    # Remove any output left over from a previous run before creating anew.
    if os.path.exists(fname):
        shutil.rmtree(fname)

    int_values = [10, 20, 10, 20, 20, 20]
    str_values = ["A", "B", "A", "B", "B", "B"]
    byte_values = [b"A", b"B", b"A", b"B", b"B", b"B"]

    pandas_df = pd.DataFrame(
        {
            "soma_joinid": pd.Series([0, 1, 2, 3, 4, 5], dtype=np.int64),
            "int_cat": pd.Series(int_values, dtype="category"),
            "int": pd.Series(int_values),
            "str_cat": pd.Series(str_values, dtype="category"),
            "str": pd.Series(str_values),
            "byte_cat": pd.Series(byte_values, dtype="category"),
            "byte": pd.Series(byte_values),
        },
    )

    print("** Original Pandas schema")
    print(pandas_df.dtypes)
    for name in pandas_df.columns:
        print(f"{name}: {pandas_df[name].dtype!r}")

    schema = pa.Schema.from_pandas(pandas_df, preserve_index=False)
    print("-----")
    print("** Arrow schema, derived from Pandas")
    print(schema)

    print("-----")
    print("** Arrow Table derived from pandas")
    print(pa.Table.from_pandas(pandas_df, preserve_index=False))
    print("-----")

    # Write the frame using the Arrow schema derived from pandas.
    with soma.DataFrame.create(fname, schema=schema) as soma_dataframe:
        soma_dataframe.write(pa.Table.from_pandas(pandas_df, preserve_index=False))

    # Re-open the stored frame and pull everything back into pandas.
    with soma.open(fname) as soma_dataframe:
        print("**Created TileDB Array schema")
        print(soma_dataframe.schema)
        df = soma_dataframe.read().concat().to_pandas()

    for name in df.columns:
        read_back = df[name]
        original = pandas_df[name]
        is_categorical = read_back.dtype == "category"

        print(f"{name}: {read_back.dtype!r}, {original.dtype!r}")
        if is_categorical:
            print(
                f"Categories dtype: {read_back.cat.categories.dtype!r}, "
                f"{original.cat.categories.dtype!r}"
            )

        # The round trip is expected to preserve dtypes exactly.
        assert read_back.dtype == original.dtype
        if is_categorical:
            assert read_back.cat.categories.dtype == original.cat.categories.dtype

    print(df)


if __name__ == "__main__":
    main()
Here is how it reads back from TileDB-Py:
import tiledb
# Inspect the array with TileDB-Py and report, for each enumerated attribute,
# its index dtype and the dtype of its enumeration values.
# Opening through a context manager ensures the array handle is closed once
# the inspection is done (the handle was previously left open).
with tiledb.open("test_dataframe") as A:
    print(A.schema)
    for i in range(A.schema.nattr):
        attr = A.schema.attr(i)
        index_type = attr.dtype
        try:
            # Only the enumeration lookup is guarded: a TileDBError here is
            # taken to mean the attribute has no enumeration.
            value_type = A.enum(attr.name).dtype
        except tiledb.cc.TileDBError:
            continue  # not an enum
        print(f"enum name={attr.name} index_type={index_type.name} value_type={value_type.name}")
soma_joinid: int64 not null
int_cat: dictionary<values=int64, indices=int8, ordered=0> not null
int: int64 not null
str_cat: dictionary<values=string, indices=int8, ordered=0> not null
str: large_string not null
byte_cat: dictionary<values=string, indices=int8, ordered=0> not null
byte: large_string not null
The text was updated successfully, but these errors were encountered:
Issue and/or context: As found by @bkmartinjr in Slack.
Here is a repro script to write a `SOMADataFrame` with attributes of various non-enum and enum types:
cat ./2305.py
Here is how it reads back from TileDB-Py:
Output from TileDB-Py:
Note that TileDB-Py correctly says `byte_cat` has `value_type=bytes`.
Here is a repro using TileDB-SOMA to print the Arrow schema:
Output before #2305:
The text was updated successfully, but these errors were encountered: