SNOW-1818205: Add support for pd.json_normalize #2657

Merged
merged 7 commits on Nov 22, 2024
Changes from 1 commit
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -47,6 +47,7 @@
#### New Features

- Added support for `DataFrame.align` and `Series.align` for `axis=1` and `axis=None`.
- Added support for `pd.json_normalize`.

#### Bug Fixes

2 changes: 2 additions & 0 deletions docs/source/modin/supported/general_supported.rst
@@ -32,6 +32,8 @@ Data manipulations
| ``get_dummies`` | P | ``sparse`` is ignored | ``Y`` if params ``dummy_na``, ``drop_first`` |
| | | | and ``dtype`` are default, otherwise ``N`` |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``json_normalize`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``lreshape`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``melt`` | P | ``col_level``, ``ignore_index`` | ``N`` if df.columns is a MultiIndex |
119 changes: 119 additions & 0 deletions src/snowflake/snowpark/modin/plugin/docstrings/io.py
@@ -343,6 +343,125 @@ def read_xml():
def json_normalize():
"""
Normalize semi-structured JSON data into a flat table.

Parameters
----------
data : dict or list of dicts
Unserialized JSON objects.
record_path : str or list of str, default None
Path in each object to list of records. If not passed, data will be assumed to be an array of records.
meta : list of paths (str or list of str), default None
Fields to use as metadata for each record in resulting table.
meta_prefix : str, default None
If True, prefix records with dotted (?) path, e.g. foo.bar.field if meta is ['foo', 'bar'].
record_prefix : str, default None
If True, prefix records with dotted (?) path, e.g. foo.bar.field if path to records is ['foo', 'bar'].
errors : {'raise', 'ignore'}, default 'raise'
Configures error handling.
- 'ignore' : will ignore KeyError if keys listed in meta are not always present.
- 'raise' : will raise KeyError if keys listed in meta are not always present.
sep : str, default '.'
Nested records will generate names separated by sep. e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
max_level : int, default None
Max number of levels (depth of dict) to normalize. If None, normalizes all levels.

Returns
-------
frame : DataFrame
Normalize semi-structured JSON data into a flat table.

Examples
--------
>>> data = [
... {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
... {"name": {"given": "Mark", "family": "Regner"}},
... {"id": 2, "name": "Faye Raker"},
... ]
>>> pd.json_normalize(data)
id name.first name.last name.given name.family name
0 1.0 Coleen Volk NaN NaN NaN
1 NaN NaN NaN Mark Regner NaN
2 2.0 NaN NaN NaN NaN Faye Raker

>>> data = [
... {
... "id": 1,
... "name": "Cole Volk",
... "fitness": {"height": 130, "weight": 60},
... },
... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
... {
... "id": 2,
... "name": "Faye Raker",
... "fitness": {"height": 130, "weight": 60},
... },
... ]
>>> pd.json_normalize(data, max_level=0)
id name fitness
0 1.0 Cole Volk {'height': 130, 'weight': 60}
1 NaN Mark Reg {'height': 130, 'weight': 60}
2 2.0 Faye Raker {'height': 130, 'weight': 60}

Normalizes nested data up to level 1.

>>> data = [
... {
... "id": 1,
... "name": "Cole Volk",
... "fitness": {"height": 130, "weight": 60},
... },
... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
... {
... "id": 2,
... "name": "Faye Raker",
... "fitness": {"height": 130, "weight": 60},
... },
... ]
>>> pd.json_normalize(data, max_level=1)
id name fitness.height fitness.weight
0 1.0 Cole Volk 130 60
1 NaN Mark Reg 130 60
2 2.0 Faye Raker 130 60

>>> data = [
... {
... "state": "Florida",
... "shortname": "FL",
... "info": {"governor": "Rick Scott"},
... "counties": [
... {"name": "Dade", "population": 12345},
... {"name": "Broward", "population": 40000},
... {"name": "Palm Beach", "population": 60000},
... ],
... },
... {
... "state": "Ohio",
... "shortname": "OH",
... "info": {"governor": "John Kasich"},
... "counties": [
... {"name": "Summit", "population": 1234},
... {"name": "Cuyahoga", "population": 1337},
... ],
... },
... ]
>>> result = pd.json_normalize(
... data, "counties", ["state", "shortname", ["info", "governor"]]
... )
>>> result
name population state shortname info.governor
0 Dade 12345 Florida FL Rick Scott
1 Broward 40000 Florida FL Rick Scott
2 Palm Beach 60000 Florida FL Rick Scott
3 Summit 1234 Ohio OH John Kasich
4 Cuyahoga 1337 Ohio OH John Kasich

>>> data = {"A": [1, 2]}
>>> pd.json_normalize(data, "A", record_prefix="Prefix.")
Prefix.0
0 1
1 2

Returns normalized data with columns prefixed with the given string.
"""


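Editor's note: one behavior documented in the Parameters section above but not shown in the docstring examples is the errors option. A minimal native-pandas illustration, not part of the PR diff; the sample data here is made up for this note, and the Snowpark pandas override inherits the same semantics:

import pandas as native_pd

data = [
    {"info": {"name": "Dade"}, "counties": [{"population": 12345}]},
    {"counties": [{"population": 40000}]},  # the "info" meta key is missing here
]
# With the default errors="raise", the missing meta key raises a KeyError;
# errors="ignore" fills the missing metadata with NaN instead.
print(native_pd.json_normalize(data, "counties", [["info", "name"]], errors="ignore"))
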
10 changes: 8 additions & 2 deletions src/snowflake/snowpark/modin/plugin/extensions/io_overrides.py
@@ -133,7 +133,6 @@ def read_xml(

@_inherit_docstrings(native_pd.json_normalize, apilink="pandas.json_normalize")
@register_pd_accessor("json_normalize")
@pandas_module_level_function_not_implemented()
def json_normalize(
data: dict | list[dict],
record_path: str | list | None = None,
@@ -146,7 +145,14 @@ def json_normalize(
) -> pd.DataFrame: # noqa: PR01, RT01, D200
# TODO(https://github.com/modin-project/modin/issues/7104):
# modin needs to remove defaults to pandas at API layer
pass # pragma: no cover
_pd_json_normalize_signature = {
val.name
for val in inspect.signature(native_pd.json_normalize).parameters.values()
}
_, _, _, f_locals = inspect.getargvalues(inspect.currentframe())
kwargs = {k: v for k, v in f_locals.items() if k in _pd_json_normalize_signature}

return pd.DataFrame(query_compiler=PandasOnSnowflakeIO.json_normalize(**kwargs))


@_inherit_docstrings(native_pd.read_orc, apilink="pandas.read_orc")
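Editor's note on the override above: it collects only those local variables whose names match parameters of pandas.json_normalize and forwards them as keyword arguments. A minimal, standalone sketch of that argument-forwarding pattern; the function name forward_to_pandas_json_normalize is illustrative, not part of the plugin:

import inspect

import pandas as native_pd


def forward_to_pandas_json_normalize(data, record_path=None, meta=None, sep="."):
    # Parameter names accepted by pandas.json_normalize, taken from its signature.
    accepted = {
        p.name
        for p in inspect.signature(native_pd.json_normalize).parameters.values()
    }
    # Local variables of the current frame whose names match those parameters.
    _, _, _, f_locals = inspect.getargvalues(inspect.currentframe())
    kwargs = {k: v for k, v in f_locals.items() if k in accepted}
    return native_pd.json_normalize(**kwargs)


print(forward_to_pandas_json_normalize([{"a": {"b": 1}}], sep="_"))

In the actual override the collected kwargs are handed to PandasOnSnowflakeIO.json_normalize, which builds the query compiler for the returned DataFrame.
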
7 changes: 7 additions & 0 deletions src/snowflake/snowpark/modin/plugin/io/snow_io.py
@@ -177,6 +177,13 @@ def from_pandas(cls, df: pandas.DataFrame):
"""
return cls.query_compiler_cls.from_pandas(df, pandas.DataFrame)

@classmethod
def json_normalize(cls, **kwargs): # noqa: PR01
"""
Normalize semi-structured JSON data into a query compiler representing a flat table.
"""
return cls.from_pandas(pandas.json_normalize(**kwargs))

@classmethod
def read_excel(cls, **kwargs): # noqa: PR01
"""
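Editor's note: conceptually, the IO-layer classmethod above normalizes the in-memory input with native pandas and then hands the flat result to from_pandas so it becomes a query compiler. A hedged sketch of the end-to-end flow, assuming an active Snowpark pandas session; the helper name local_normalize_then_upload is illustrative only:

import modin.pandas as pd
import pandas as native_pd
import snowflake.snowpark.modin.plugin  # noqa: F401  # registers the Snowflake backend


def local_normalize_then_upload(**kwargs):
    # Step 1: flatten locally; the input is an in-memory dict / list of dicts,
    # not data already stored in Snowflake.
    flat = native_pd.json_normalize(**kwargs)
    # Step 2: constructing a modin DataFrame from the pandas result uploads it,
    # so subsequent operations execute against Snowflake.
    return pd.DataFrame(flat)


df = local_normalize_then_upload(data=[{"id": 1, "name": {"first": "Coleen"}}])
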
98 changes: 98 additions & 0 deletions tests/integ/modin/io/test_json_normalize.py
@@ -0,0 +1,98 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
import modin.pandas as pd
import pandas as native_pd
import pytest

from tests.integ.modin.utils import assert_frame_equal
from tests.integ.utils.sql_counter import SqlCounter


def test_json_normalize_basic():
data = [
{"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
{"name": {"given": "Mark", "family": "Regner"}},
{"id": 2, "name": "Faye Raker"},
]

with SqlCounter(query_count=1):
assert_frame_equal(
pd.json_normalize(data),
native_pd.json_normalize(data),
check_dtype=False,
)


@pytest.mark.parametrize("max_level", [0, 1])
def test_json_normalize_max_level(max_level):
data = [
{
"id": 1,
"name": "Cole Volk",
"fitness": {"height": 130, "weight": 60},
},
{"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
{
"id": 2,
"name": "Faye Raker",
"fitness": {"height": 130, "weight": 60},
},
]

with SqlCounter(query_count=1):
assert_frame_equal(
pd.json_normalize(data=data, max_level=max_level),
native_pd.json_normalize(data=data, max_level=max_level),
check_dtype=False,
)


def test_json_normalize_record_path_meta():
data = [
{
"state": "Florida",
"shortname": "FL",
"info": {"governor": "Rick Scott"},
"counties": [
{"name": "Dade", "population": 12345},
{"name": "Broward", "population": 40000},
{"name": "Palm Beach", "population": 60000},
],
},
{
"state": "Ohio",
"shortname": "OH",
"info": {"governor": "John Kasich"},
"counties": [
{"name": "Summit", "population": 1234},
{"name": "Cuyahoga", "population": 1337},
],
},
]

with SqlCounter(query_count=1):
assert_frame_equal(
pd.json_normalize(
data=data,
record_path="counties",
meta=["state", "shortname", ["info", "governor"]],
),
native_pd.json_normalize(
data=data,
record_path="counties",
meta=["state", "shortname", ["info", "governor"]],
),
check_dtype=False,
)


def test_json_normalize_record_prefix():
data = {"A": [1, 2]}

with SqlCounter(query_count=1):
assert_frame_equal(
pd.json_normalize(data=data, record_prefix="Prefix."),
native_pd.json_normalize(data=data, record_prefix="Prefix."),
check_dtype=False,
)