From bbd7a621e6d3c84d05c8cee09ab4f826a949d2c1 Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Thu, 21 Nov 2024 18:51:54 -0800 Subject: [PATCH] SNOW-1818205: Add support for pd.json_normalize (#2657) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1818205 2. Fill out the following pre-review checklist: - [ ] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. - [ ] I acknowledge that I have ensured my changes to be thread-safe. Follow the link for more information: [Thread-safe Developer Guidelines](https://docs.google.com/document/d/162d_i4zZ2AfcGRXojj0jByt8EUq-DrSHPPnTa4QvwbA/edit#bookmark=id.e82u4nekq80k) 3. Please describe how your code solves the related issue. Add support for pd.json_normalize. --- CHANGELOG.md | 1 + .../modin/supported/general_supported.rst | 2 + .../snowpark/modin/plugin/docstrings/io.py | 119 ++++++++++++++++++ .../modin/plugin/extensions/io_overrides.py | 10 +- .../snowpark/modin/plugin/io/snow_io.py | 7 ++ tests/integ/modin/io/test_json_normalize.py | 98 +++++++++++++++ 6 files changed, 235 insertions(+), 2 deletions(-) create mode 100644 tests/integ/modin/io/test_json_normalize.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e85be7b8500..5629d3505db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,7 @@ #### New Features - Added support for `DataFrame.align` and `Series.align` for `axis=1` and `axis=None`. +- Added support for `pd.json_normalize`. 
#### Bug Fixes diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst index bcefb9f44af..5c97e72476d 100644 --- a/docs/source/modin/supported/general_supported.rst +++ b/docs/source/modin/supported/general_supported.rst @@ -32,6 +32,8 @@ Data manipulations | ``get_dummies`` | P | ``sparse`` is ignored | ``Y`` if params ``dummy_na``, ``drop_first`` | | | | | and ``dtype`` are default, otherwise ``N`` | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ +| ``json_normalize`` | Y | | | ++-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``lreshape`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``melt`` | P | ``col_level``, ``ignore_index`` | ``N`` if df.columns is a MultiIndex | diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/io.py b/src/snowflake/snowpark/modin/plugin/docstrings/io.py index c45ba8cfa86..391da898478 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/io.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/io.py @@ -343,6 +343,125 @@ def read_xml(): def json_normalize(): """ Normalize semi-structured JSON data into a flat table. + + Parameters + ---------- + data : dict or list of dicts + Unserialized JSON objects. + record_path : str or list of str, default None + Path in each object to list of records. If not passed, data will be assumed to be an array of records. + meta : list of paths (str or list of str), default None + Fields to use as metadata for each record in resulting table. + meta_prefix : str, default None + If True, prefix records with dotted path, e.g. foo.bar.field if meta is [‘foo’, ‘bar’]. 
+ record_prefix : str, default None + If True, prefix records with dotted path, e.g. foo.bar.field if path to records is [‘foo’, ‘bar’]. + errors : {‘raise’, ‘ignore’}, default ‘raise’ + Configures error handling. + - ‘ignore’ : will ignore KeyError if keys listed in meta are not always present. + - ‘raise’ : will raise KeyError if keys listed in meta are not always present. + sep : str, default ‘.’ + Nested records will generate names separated by sep. e.g., for sep=‘.’, {‘foo’: {‘bar’: 0}} -> foo.bar. + max_level : int, default None + Max number of levels (depth of dict) to normalize. If None, normalizes all levels. + + Returns + ------- + frame : DataFrame + Normalize semi-structured JSON data into a flat table. + + Examples + -------- + >>> data = [ + ... {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + ... {"name": {"given": "Mark", "family": "Regner"}}, + ... {"id": 2, "name": "Faye Raker"}, + ... ] + >>> pd.json_normalize(data) + id name.first name.last name.given name.family name + 0 1.0 Coleen Volk None None None + 1 NaN None None Mark Regner None + 2 2.0 None None None None Faye Raker + + >>> data = [ + ... { + ... "id": 1, + ... "name": "Cole Volk", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "id": 2, + ... "name": "Faye Raker", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... ] + >>> pd.json_normalize(data, max_level=0) + id name fitness + 0 1.0 Cole Volk {'height': 130, 'weight': 60} + 1 NaN Mark Reg {'height': 130, 'weight': 60} + 2 2.0 Faye Raker {'height': 130, 'weight': 60} + + Normalizes nested data up to level 1. + + >>> data = [ + ... { + ... "id": 1, + ... "name": "Cole Volk", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "id": 2, + ... "name": "Faye Raker", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... 
] + >>> pd.json_normalize(data, max_level=1) + id name fitness.height fitness.weight + 0 1.0 Cole Volk 130 60 + 1 NaN Mark Reg 130 60 + 2 2.0 Faye Raker 130 60 + + >>> data = [ + ... { + ... "state": "Florida", + ... "shortname": "FL", + ... "info": {"governor": "Rick Scott"}, + ... "counties": [ + ... {"name": "Dade", "population": 12345}, + ... {"name": "Broward", "population": 40000}, + ... {"name": "Palm Beach", "population": 60000}, + ... ], + ... }, + ... { + ... "state": "Ohio", + ... "shortname": "OH", + ... "info": {"governor": "John Kasich"}, + ... "counties": [ + ... {"name": "Summit", "population": 1234}, + ... {"name": "Cuyahoga", "population": 1337}, + ... ], + ... }, + ... ] + >>> result = pd.json_normalize( + ... data, "counties", ["state", "shortname", ["info", "governor"]] + ... ) + >>> result + name population state shortname info.governor + 0 Dade 12345 Florida FL Rick Scott + 1 Broward 40000 Florida FL Rick Scott + 2 Palm Beach 60000 Florida FL Rick Scott + 3 Summit 1234 Ohio OH John Kasich + 4 Cuyahoga 1337 Ohio OH John Kasich + + >>> data = {"A": [1, 2]} + >>> pd.json_normalize(data, "A", record_prefix="Prefix.") + Prefix.0 + 0 1 + 1 2 + + Returns normalized data with columns prefixed with the given string. 
""" diff --git a/src/snowflake/snowpark/modin/plugin/extensions/io_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/io_overrides.py index 64fb72b7338..7b9d94d650f 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/io_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/io_overrides.py @@ -133,7 +133,6 @@ def read_xml( @_inherit_docstrings(native_pd.json_normalize, apilink="pandas.json_normalize") @register_pd_accessor("json_normalize") -@pandas_module_level_function_not_implemented() def json_normalize( data: dict | list[dict], record_path: str | list | None = None, @@ -146,7 +145,14 @@ def json_normalize( ) -> pd.DataFrame: # noqa: PR01, RT01, D200 # TODO(https://github.com/modin-project/modin/issues/7104): # modin needs to remove defaults to pandas at API layer - pass # pragma: no cover + _pd_json_normalize_signature = { + val.name + for val in inspect.signature(native_pd.json_normalize).parameters.values() + } + _, _, _, f_locals = inspect.getargvalues(inspect.currentframe()) + kwargs = {k: v for k, v in f_locals.items() if k in _pd_json_normalize_signature} + + return pd.DataFrame(query_compiler=PandasOnSnowflakeIO.json_normalize(**kwargs)) @_inherit_docstrings(native_pd.read_orc, apilink="pandas.read_orc") diff --git a/src/snowflake/snowpark/modin/plugin/io/snow_io.py b/src/snowflake/snowpark/modin/plugin/io/snow_io.py index 9c3e9a7621d..61c17d33c53 100644 --- a/src/snowflake/snowpark/modin/plugin/io/snow_io.py +++ b/src/snowflake/snowpark/modin/plugin/io/snow_io.py @@ -177,6 +177,13 @@ def from_pandas(cls, df: pandas.DataFrame): """ return cls.query_compiler_cls.from_pandas(df, pandas.DataFrame) + @classmethod + def json_normalize(cls, **kwargs): # noqa: PR01 + """ + Normalize semi-structured JSON data into a query compiler representing a flat table. 
+ """ + return cls.from_pandas(pandas.json_normalize(**kwargs)) + @classmethod def read_excel(cls, **kwargs): # noqa: PR01 """ diff --git a/tests/integ/modin/io/test_json_normalize.py b/tests/integ/modin/io/test_json_normalize.py new file mode 100644 index 00000000000..2dd13ec3f5b --- /dev/null +++ b/tests/integ/modin/io/test_json_normalize.py @@ -0,0 +1,98 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# +import modin.pandas as pd +import pandas as native_pd +import pytest + +from tests.integ.modin.utils import assert_frame_equal +from tests.integ.utils.sql_counter import SqlCounter + + +def test_json_normalize_basic(): + data = [ + {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + {"name": {"given": "Mark", "family": "Regner"}}, + {"id": 2, "name": "Faye Raker"}, + ] + + with SqlCounter(query_count=1): + assert_frame_equal( + pd.json_normalize(data), + native_pd.json_normalize(data), + check_dtype=False, + ) + + +@pytest.mark.parametrize("max_level", [0, 1]) +def test_json_normalize_max_level(max_level): + data = [ + { + "id": 1, + "name": "Cole Volk", + "fitness": {"height": 130, "weight": 60}, + }, + {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + { + "id": 2, + "name": "Faye Raker", + "fitness": {"height": 130, "weight": 60}, + }, + ] + + with SqlCounter(query_count=1): + assert_frame_equal( + pd.json_normalize(data=data, max_level=max_level), + native_pd.json_normalize(data=data, max_level=max_level), + check_dtype=False, + ) + + +def test_json_normalize_record_path_meta(): + data = [ + { + "state": "Florida", + "shortname": "FL", + "info": {"governor": "Rick Scott"}, + "counties": [ + {"name": "Dade", "population": 12345}, + {"name": "Broward", "population": 40000}, + {"name": "Palm Beach", "population": 60000}, + ], + }, + { + "state": "Ohio", + "shortname": "OH", + "info": {"governor": "John Kasich"}, + "counties": [ + {"name": "Summit", "population": 1234}, + {"name": "Cuyahoga", "population": 
1337}, + ], + }, + ] + + with SqlCounter(query_count=1): + assert_frame_equal( + pd.json_normalize( + data=data, + record_path="counties", + meta=["state", "shortname", ["info", "governor"]], + ), + native_pd.json_normalize( + data=data, + record_path="counties", + meta=["state", "shortname", ["info", "governor"]], + ), + check_dtype=False, + ) + + +def test_json_normalize_record_prefix(): + data = {"A": [1, 2]} + + with SqlCounter(query_count=1): + assert_frame_equal( + pd.json_normalize(data=data, record_prefix="Prefix."), + native_pd.json_normalize(data=data, record_prefix="Prefix."), + check_dtype=False, + )