Skip to content

Commit

Permalink
ENH: Implement arrow string option for various I/O methods (pandas-de…
Browse files Browse the repository at this point in the history
…v#54431)

* ENH: Implement arrow string option for various I/O methods

* ENH: allow opt-in to inferring pyarrow strings

* Remove comments and add tests

* Add string option to arrow parsers

* Update

* Update

* Adjust csv

* Update

* Update

* Add test

* Fix mypy

---------

Co-authored-by: Brock <[email protected]>
  • Loading branch information
phofl and jbrockmendel authored Aug 10, 2023
1 parent 40df396 commit 57c7943
Show file tree
Hide file tree
Showing 14 changed files with 134 additions and 15 deletions.
5 changes: 5 additions & 0 deletions pandas/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,8 @@ def using_copy_on_write() -> bool:
def using_nullable_dtypes() -> bool:
    """Return the ``nullable_dtypes`` entry of the ``mode`` config section."""
    return _global_config["mode"]["nullable_dtypes"]


def using_pyarrow_string_dtype() -> bool:
    """Return the ``infer_string`` entry of the ``future`` config section.

    This is the value behind the ``future.infer_string`` option.
    """
    return _global_config["future"]["infer_string"]
6 changes: 3 additions & 3 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ from cython cimport (
floating,
)

from pandas._config import using_pyarrow_string_dtype

from pandas._libs.missing import check_na_tuples_nonequal

import_datetime()
Expand Down Expand Up @@ -2679,9 +2681,7 @@ def maybe_convert_objects(ndarray[object] objects,

elif seen.str_:
if is_string_array(objects, skipna=True):
from pandas._config import get_option
opt = get_option("future.infer_string")
if opt is True:
if using_pyarrow_string_dtype():
import pyarrow as pa

from pandas.core.dtypes.dtypes import ArrowDtype
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import numpy as np

from pandas._config import get_option
from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas._libs.missing import (
Expand Down Expand Up @@ -798,8 +798,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
# coming out as np.str_!

dtype = _dtype_obj
opt = get_option("future.infer_string")
if opt is True:
if using_pyarrow_string_dtype():
import pyarrow as pa

pa_dtype = pa.string()
Expand Down
8 changes: 8 additions & 0 deletions pandas/io/_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from typing import Callable

from pandas.compat._optional import import_optional_dependency

import pandas as pd
Expand All @@ -21,3 +23,9 @@ def _arrow_dtype_mapping() -> dict:
pa.float32(): pd.Float32Dtype(),
pa.float64(): pd.Float64Dtype(),
}


def arrow_string_types_mapper() -> Callable:
    """
    Build a lookup callable that maps pyarrow's string type to
    ``pd.ArrowDtype(pa.string())``.

    The callable is the ``get`` method of a one-entry dict, so any other
    pyarrow type yields ``None``.
    """
    pyarrow = import_optional_dependency("pyarrow")

    string_type = pyarrow.string()
    mapping = {string_type: pd.ArrowDtype(string_type)}
    return mapping.get
10 changes: 9 additions & 1 deletion pandas/io/feather_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
Any,
)

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
Expand All @@ -15,6 +17,7 @@
from pandas.core.api import DataFrame
from pandas.core.shared_docs import _shared_docs

from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import get_handle

if TYPE_CHECKING:
Expand Down Expand Up @@ -119,7 +122,7 @@ def read_feather(
with get_handle(
path, "rb", storage_options=storage_options, is_text=False
) as handles:
if dtype_backend is lib.no_default:
if dtype_backend is lib.no_default and not using_pyarrow_string_dtype():
return feather.read_feather(
handles.handle, columns=columns, use_threads=bool(use_threads)
)
Expand All @@ -135,3 +138,8 @@ def read_feather(

elif dtype_backend == "pyarrow":
return pa_table.to_pandas(types_mapper=pd.ArrowDtype)

elif using_pyarrow_string_dtype():
return pa_table.to_pandas(types_mapper=arrow_string_types_mapper())
else:
raise NotImplementedError
9 changes: 8 additions & 1 deletion pandas/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
Literal,
)

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat import pa_version_under8p0
from pandas.compat._optional import import_optional_dependency
Expand All @@ -24,6 +26,7 @@
import pandas as pd
from pandas.core.indexes.api import default_index

from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import (
get_handle,
is_fsspec_url,
Expand Down Expand Up @@ -132,7 +135,11 @@ def read_orc(
df = pa_table.to_pandas(types_mapper=mapping.get)
return df
else:
return pa_table.to_pandas()
if using_pyarrow_string_dtype():
types_mapper = arrow_string_types_mapper()
else:
types_mapper = None
return pa_table.to_pandas(types_mapper=types_mapper)


def to_orc(
Expand Down
5 changes: 5 additions & 0 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import warnings
from warnings import catch_warnings

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
Expand All @@ -26,6 +28,7 @@
)
from pandas.core.shared_docs import _shared_docs

from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import (
IOHandles,
get_handle,
Expand Down Expand Up @@ -252,6 +255,8 @@ def read(
to_pandas_kwargs["types_mapper"] = mapping.get
elif dtype_backend == "pyarrow":
to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa: E501
elif using_pyarrow_string_dtype():
to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper()

manager = get_option("mode.data_manager")
if manager == "array":
Expand Down
9 changes: 8 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from typing import TYPE_CHECKING

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency

Expand All @@ -10,7 +12,10 @@
import pandas as pd
from pandas import DataFrame

from pandas.io._util import _arrow_dtype_mapping
from pandas.io._util import (
_arrow_dtype_mapping,
arrow_string_types_mapper,
)
from pandas.io.parsers.base_parser import ParserBase

if TYPE_CHECKING:
Expand Down Expand Up @@ -215,6 +220,8 @@ def read(self) -> DataFrame:
dtype_mapping = _arrow_dtype_mapping()
dtype_mapping[pa.null()] = pd.Int64Dtype()
frame = table.to_pandas(types_mapper=dtype_mapping.get)
elif using_pyarrow_string_dtype():
frame = table.to_pandas(types_mapper=arrow_string_types_mapper())
else:
frame = table.to_pandas()
return self._finalize_pandas_output(frame)
24 changes: 22 additions & 2 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@
from pandas._config import (
config,
get_option,
using_pyarrow_string_dtype,
)

from pandas._libs import (
lib,
writers as libwriters,
)
from pandas._libs.lib import is_string_array
from pandas._libs.tslibs import timezones
from pandas.compat._optional import import_optional_dependency
from pandas.compat.pickle_compat import patch_pickle
Expand Down Expand Up @@ -66,6 +68,7 @@
)
from pandas.core.dtypes.missing import array_equivalent

import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Expand Down Expand Up @@ -3219,7 +3222,12 @@ def read(
self.validate_read(columns, where)
index = self.read_index("index", start=start, stop=stop)
values = self.read_array("values", start=start, stop=stop)
return Series(values, index=index, name=self.name, copy=False)
result = Series(values, index=index, name=self.name, copy=False)
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
import pyarrow as pa

result = result.astype(pd.ArrowDtype(pa.string()))
return result

# error: Signature of "write" incompatible with supertype "Fixed"
def write(self, obj, **kwargs) -> None: # type: ignore[override]
Expand Down Expand Up @@ -3287,6 +3295,10 @@ def read(

columns = items[items.get_indexer(blk_items)]
df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
import pyarrow as pa

df = df.astype(pd.ArrowDtype(pa.string()))
dfs.append(df)

if len(dfs) > 0:
Expand Down Expand Up @@ -4668,7 +4680,15 @@ def read(
else:
# Categorical
df = DataFrame._from_arrays([values], columns=cols_, index=index_)
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"):
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
if using_pyarrow_string_dtype() and is_string_array(
values, # type: ignore[arg-type]
skipna=True,
):
import pyarrow as pa

df = df.astype(pd.ArrowDtype(pa.string()))
frames.append(df)

if len(frames) == 1:
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,15 +547,14 @@ def test_string_inference(all_parsers):

data = """a,b
x,1
y,2"""
y,2
,3"""
parser = all_parsers
if parser.engine == "pyarrow":
pytest.skip("TODO: Follow up")
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data))

expected = DataFrame(
{"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]},
{"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]},
columns=pd.Index(["a", "b"], dtype=dtype),
)
tm.assert_frame_equal(result, expected)
16 changes: 16 additions & 0 deletions pandas/tests/io/pytables/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,3 +388,19 @@ def test_read_py2_hdf_file_in_py3(datapath):
) as store:
result = store["p"]
tm.assert_frame_equal(result, expected)


def test_read_infer_string(tmp_path, setup_path):
    # GH#54431
    # Write a string column (with a missing value) as a table-format HDF
    # file, then read it back while ``future.infer_string`` is switched on.
    pa = pytest.importorskip("pyarrow")
    hdf_path = tmp_path / setup_path
    original = DataFrame({"a": ["a", "b", None]})
    original.to_hdf(hdf_path, key="data", format="table")

    with pd.option_context("future.infer_string", True):
        result = read_hdf(hdf_path, key="data", mode="r")

    # Every dtype of the expectation is spelled out explicitly, so it does
    # not depend on the option being active while it is constructed.
    string_dtype = pd.ArrowDtype(pa.string())
    expected = DataFrame(
        {"a": ["a", "b", None]},
        columns=Index(["a"], dtype=string_dtype),
        dtype=string_dtype,
    )
    tm.assert_frame_equal(result, expected)
14 changes: 14 additions & 0 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,17 @@ def test_invalid_dtype_backend(self):
df.to_feather(path)
with pytest.raises(ValueError, match=msg):
read_feather(path, dtype_backend="numpy")

def test_string_inference(self, tmp_path):
    # GH#54431
    # Round-trip a string column through a feather file and compare the
    # frame read back under ``future.infer_string`` with an
    # ArrowDtype(pa.string()) expectation.
    import pyarrow as pa

    path = tmp_path / "test_string_inference.p"
    df = pd.DataFrame(data={"a": ["x", "y"]})
    df.to_feather(path)
    # NOTE(review): indentation is not preserved in this rendering; only the
    # read is assumed to run inside the option context -- confirm against
    # the actual file, since ``expected`` leaves its column index dtype to
    # inference.
    with pd.option_context("future.infer_string", True):
        result = read_feather(path)
    expected = pd.DataFrame(
        data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string())
    )
    tm.assert_frame_equal(result, expected)
15 changes: 15 additions & 0 deletions pandas/tests/io/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,3 +415,18 @@ def test_invalid_dtype_backend():
df.to_orc(path)
with pytest.raises(ValueError, match=msg):
read_orc(path, dtype_backend="numpy")


def test_string_inference(tmp_path):
    # GH#54431
    # Write a string column to an ORC file and read it back while
    # ``future.infer_string`` is switched on.
    orc_path = tmp_path / "test_string_inference.p"
    source = pd.DataFrame(data={"a": ["x", "y"]})
    source.to_orc(orc_path)

    with pd.option_context("future.infer_string", True):
        result = read_orc(orc_path)

    # Both the values and the column index are expected as arrow strings;
    # the dtypes are given explicitly rather than inferred.
    arrow_string = pd.ArrowDtype(pa.string())
    expected = pd.DataFrame(
        data={"a": ["x", "y"]},
        columns=pd.Index(["a"], dtype=arrow_string),
        dtype=arrow_string,
    )
    tm.assert_frame_equal(result, expected)
16 changes: 16 additions & 0 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1103,6 +1103,22 @@ def test_df_attrs_persistence(self, tmp_path, pa):
new_df = read_parquet(path, engine=pa)
assert new_df.attrs == df.attrs

def test_string_inference(self, tmp_path, pa):
    # GH#54431
    # Round-trip a string column with a string index through a parquet file
    # (pyarrow engine) and compare the frame read back under
    # ``future.infer_string`` with an ArrowDtype(pa.string()) expectation.
    # NOTE: the import below rebinds ``pa`` -- the parameter of the same
    # name is not used after this point.
    import pyarrow as pa

    path = tmp_path / "test_string_inference.p"
    df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
    df.to_parquet(path, engine="pyarrow")
    # NOTE(review): indentation is not preserved in this rendering; only the
    # read is assumed to run inside the option context -- confirm against
    # the actual file, since ``expected`` leaves its column index dtype to
    # inference.
    with pd.option_context("future.infer_string", True):
        result = read_parquet(path, engine="pyarrow")
    expected = pd.DataFrame(
        data={"a": ["x", "y"]},
        dtype=pd.ArrowDtype(pa.string()),
        index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
    )
    tm.assert_frame_equal(result, expected)


class TestParquetFastParquet(Base):
def test_basic(self, fp, df_full):
Expand Down

0 comments on commit 57c7943

Please sign in to comment.