diff --git a/.github/workflows/daily_modin_precommit.yml b/.github/workflows/daily_modin_precommit.yml
index 16ada4d9ad4..9581959f2be 100644
--- a/.github/workflows/daily_modin_precommit.yml
+++ b/.github/workflows/daily_modin_precommit.yml
@@ -173,6 +173,68 @@ jobs:
           .tox/.coverage
           .tox/coverage.xml
 
+  test-pandas-patch-versions:
+    name: Test Snowpark pandas with pandas ${{ matrix.pandas-version }}
+    needs: build
+    runs-on: ${{ matrix.os.image_name }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - image_name: ubuntu-latest-64-cores
+            download_name: linux
+        pandas-version: ["2.2.1", "2.2.2"]
+        python-version: ["3.9"]
+        cloud-provider: [aws]
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+      - name: Decrypt parameters.py
+        shell: bash
+        run: .github/scripts/decrypt_parameters.sh
+        env:
+          PARAMETER_PASSWORD: ${{ secrets.PARAMETER_PASSWORD }}
+          CLOUD_PROVIDER: ${{ matrix.cloud-provider }}
+      - name: Download wheel(s)
+        uses: actions/download-artifact@v4
+        with:
+          name: wheel
+          path: dist
+      - name: Show wheels downloaded
+        run: ls -lh dist
+        shell: bash
+      - name: Upgrade setuptools, pip and wheel
+        run: python -m pip install -U setuptools pip wheel
+      - name: Install tox
+        run: python -m pip install tox
+      - if: ${{ contains('macos', matrix.os.download_name) }}
+        name: Run Snowpark pandas API doctests
+        run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION/\./}-doctest-snowparkpandasdoctest-modin-ci"
+        env:
+          MODIN_PANDAS_PATCH_VERSION: ${{ matrix.pandas-version }}
+          PYTHON_VERSION: ${{ matrix.python-version }}
+          cloud_provider: ${{ matrix.cloud-provider }}
+          PYTEST_ADDOPTS: --color=yes --tb=short
+          TOX_PARALLEL_NO_SPINNER: 1
+        # Specify SNOWFLAKE_IS_PYTHON_RUNTIME_TEST: 1 when adding >= python3.11 with no server-side support
+        # For example, see https://github.com/snowflakedb/snowpark-python/pull/681
+        shell: bash
+      - name: Run Snowpark pandas API tests (excluding doctests)
+        run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION/\./}-snowparkpandasdailynotdoctest-modin-ci"
+        env:
+          MODIN_PANDAS_PATCH_VERSION: ${{ matrix.pandas-version }}
+          PYTHON_VERSION: ${{ matrix.python-version }}
+          cloud_provider: ${{ matrix.cloud-provider }}
+          PYTEST_ADDOPTS: --color=yes --tb=short
+          TOX_PARALLEL_NO_SPINNER: 1
+        shell: bash
+
   test-disable-sql-simplifier: # Will be removed after sql simplifier is stable and no option to opt out.
     name: Test Disable SQL Simplifier modin-${{ matrix.os.download_name }}-${{ matrix.python-version }}-${{ matrix.cloud-provider }}
     needs: build
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c84192536b2..c6af0ccad4b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,6 +41,11 @@
 
 ### Snowpark pandas API Updates
 
+#### Dependency Updates
+
+- Updated `modin` from 0.28.1 to 0.30.1.
+- Added support for all `pandas` 2.2.x versions.
+
 #### New Features
 
 - Added support for `np.subtract`, `np.multiply`, `np.divide`, and `np.true_divide`.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 66def5ea34d..6e54296e9d9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -35,7 +35,7 @@ cd snowpark-python
 
 - Create a new Python virtual environment with any Python version that we support.
   - The Snowpark Python API supports **Python 3.8, Python 3.9, Python 3.10, and Python 3.11**.
-  - The Snowpark pandas API supports **Python 3.9, Python 3.10, and Python 3.11**. Additionally, Snowpark pandas requires **Modin 0.28.1** and **pandas 2.2.1**.
+  - The Snowpark pandas API supports **Python 3.9, Python 3.10, and Python 3.11**. Additionally, Snowpark pandas requires **Modin 0.30.1** and **pandas 2.2.x**.
 
 ```bash
 conda create --name snowpark-dev python=3.9
diff --git a/setup.py b/setup.py
index fc50573905f..cf02daf1902 100644
--- a/setup.py
+++ b/setup.py
@@ -10,9 +10,7 @@
 THIS_DIR = os.path.dirname(os.path.realpath(__file__))
 SRC_DIR = os.path.join(THIS_DIR, "src")
 SNOWPARK_SRC_DIR = os.path.join(SRC_DIR, "snowflake", "snowpark")
-MODIN_DEPENDENCY_VERSION = (
-    "==0.28.1"  # Snowpark pandas requires modin 0.28.1, which depends on pandas 2.2.1
-)
+MODIN_DEPENDENCY_VERSION = "==0.30.1"  # Snowpark pandas requires modin 0.30.1, which is compatible with pandas 2.2.x
 CONNECTOR_DEPENDENCY_VERSION = ">=3.12.0, <4.0.0"
 CONNECTOR_DEPENDENCY = f"snowflake-connector-python{CONNECTOR_DEPENDENCY_VERSION}"
 INSTALL_REQ_LIST = [
diff --git a/src/snowflake/snowpark/modin/plugin/__init__.py b/src/snowflake/snowpark/modin/plugin/__init__.py
index 2a277bde92f..2e9fbe721f0 100644
--- a/src/snowflake/snowpark/modin/plugin/__init__.py
+++ b/src/snowflake/snowpark/modin/plugin/__init__.py
@@ -21,11 +21,17 @@
 # since modin may raise its own warnings/errors on the wrong pandas version
 import pandas  # isort: skip  # noqa: E402
 
-supported_pandas_version = "2.2.1"
-if pandas.__version__ != supported_pandas_version:
+# TODO SNOW-1758773: perform version check in modin instead
+supported_pandas_major_version = 2
+supported_pandas_minor_version = 2
+actual_pandas_version = version.parse(pandas.__version__)
+if (
+    actual_pandas_version.major != supported_pandas_major_version
+    or actual_pandas_version.minor != supported_pandas_minor_version
+):
     raise RuntimeError(
         f"The pandas version installed ({pandas.__version__}) does not match the supported pandas version in"
-        + f" Snowpark pandas ({supported_pandas_version}). "
+        + f" Snowpark pandas ({supported_pandas_major_version}.{supported_pandas_minor_version}.x). "
         + install_msg
     )  # pragma: no cover
@@ -36,7 +42,7 @@
         "Modin is not installed. " + install_msg
     )  # pragma: no cover
 
-supported_modin_version = "0.28.1"
+supported_modin_version = "0.30.1"
 if version.parse(modin.__version__) != version.parse(supported_modin_version):
     raise ImportError(
         f"The Modin version installed ({modin.__version__}) does not match the supported Modin version in"
@@ -136,6 +142,7 @@
     register_pd_accessor,
     register_series_accessor,
 )
+from modin.pandas.accessor import ModinAPI  # isort: skip  # noqa: E402,F401
 
 from snowflake.snowpark.modin.plugin._internal.telemetry import (  # isort: skip  # noqa: E402,F401
     TELEMETRY_PRIVATE_METHODS,
@@ -143,10 +150,26 @@
     try_add_telemetry_to_attribute,
 )
 
+# Add telemetry on the ModinAPI accessor object.
+# modin 0.30.1 introduces the pd.DataFrame.modin accessor object for non-pandas methods,
+# such as pd.DataFrame.modin.to_pandas and pd.DataFrame.modin.to_ray. We will automatically
+# raise NotImplementedError for all methods on this accessor object except to_pandas, but
+# we still want to record telemetry.
+for attr_name in dir(ModinAPI):
+    if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS:
+        setattr(
+            ModinAPI,
+            attr_name,
+            try_add_telemetry_to_attribute(attr_name, getattr(ModinAPI, attr_name)),
+        )
+
 for attr_name in dir(Series):
     # Since Series is defined in upstream Modin, all of its members were either defined upstream
     # or overridden by extension.
-    if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS:
+    # Skip the `modin` accessor object, since we apply telemetry to all its fields.
+    if attr_name != "modin" and (
+        not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS
+    ):
         register_series_accessor(attr_name)(
             try_add_telemetry_to_attribute(attr_name, getattr(Series, attr_name))
         )
@@ -154,7 +177,10 @@
 for attr_name in dir(DataFrame):
     # Since DataFrame is defined in upstream Modin, all of its members were either defined upstream
     # or overridden by extension.
-    if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS:
+    # Skip the `modin` accessor object, since we apply telemetry to all its fields.
+    if attr_name != "modin" and (
+        not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS
+    ):
         register_dataframe_accessor(attr_name)(
             try_add_telemetry_to_attribute(attr_name, getattr(DataFrame, attr_name))
         )
diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
index 166143ed1e8..c5f766893fa 100644
--- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
+++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
@@ -713,6 +713,7 @@ def aggregate():
     agg = aggregate
 
     def apply():
+        # TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda
         """
         Apply a function along an axis of the DataFrame.
 
@@ -821,7 +822,7 @@ def apply():
 
         Using a reducing function on ``axis=1``:
 
-        >>> df.apply(np.sum, axis=1)
+        >>> df.apply(np.sum, axis=1)  # doctest: +SKIP
         0     2
         1    10
         2    13
@@ -829,7 +830,7 @@
 
         Returning a list-like object will result in a Series:
 
-        >>> df.apply(lambda x: [1, 2], axis=1)
+        >>> df.apply(lambda x: [1, 2], axis=1)  # doctest: +SKIP
         0    [1, 2]
         1    [1, 2]
         2    [1, 2]
@@ -1022,6 +1023,7 @@ def keys():
         """
 
     def transform():
+        # TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda
         """
         Call ``func`` on self producing a Snowpark pandas DataFrame with the same axis shape as self.
 
@@ -1055,7 +1057,7 @@
         0     1     3
         1     2     4
         2     3     5
-        >>> df.transform(lambda x: x + 1, axis=1)
+        >>> df.transform(lambda x: x + 1, axis=1)  # doctest: +SKIP
            col1  col2
         0     2     4
         1     3     5
@@ -1063,7 +1065,7 @@
 
         Apply a numpy ufunc to every value in the DataFrame.
 
-        >>> df.transform(np.square, axis=1)
+        >>> df.transform(np.square, axis=1)  # doctest: +SKIP
            col1  col2
         0     1     9
         1     4    16
diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
index f9260ddb0a1..0dbdced47c2 100644
--- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
+++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
@@ -989,6 +989,7 @@ def cummax():
         """
 
     def apply():
+        # TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda
         """
         Apply function ``func`` group-wise and combine the results together.
 
@@ -1050,7 +1051,7 @@ def apply():
         its argument and returns a DataFrame. `apply` combines the result for each
         group together into a new DataFrame:
 
-        >>> g1[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum())  # doctest: +NORMALIZE_WHITESPACE
+        >>> g1[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum())  # doctest: +SKIP
                     B    C
         0.0  0.333333  0.4
         1.0  0.666667  0.6
 
@@ -1059,7 +1060,7 @@
         In the above, the groups are not part of the index.
         We can have them included by using ``g2`` where ``group_keys=True``:
 
-        >>> g2[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum())  # doctest: +NORMALIZE_WHITESPACE
+        >>> g2[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum())  # doctest: +SKIP
                       B    C
         A
         a 0.0  0.333333  0.4
@@ -1942,6 +1943,7 @@ def cov():
         pass
 
     def transform():
+        # TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda
         """
         Call function producing a same-indexed DataFrame on each group.
 
@@ -2011,7 +2013,7 @@
         i  X     9    90    -9
         j  Y    10    10   -10
 
-        >>> df.groupby("col1", dropna=True).transform(lambda df, n: df.head(n), n=2)
+        >>> df.groupby("col1", dropna=True).transform(lambda df, n: df.head(n), n=2)  # doctest: +SKIP
           col2  col3  col4
         a  1.0  40.0  -1.0
         b  NaN   NaN   NaN
@@ -2024,7 +2026,7 @@
         i   NaN   NaN   NaN
         j  10.0  10.0 -10.0
 
-        >>> df.groupby("col1", dropna=False).transform("mean")
+        >>> df.groupby("col1", dropna=False).transform("mean")  # doctest: +SKIP
           col2  col3  col4
         a  2.50  25.0 -2.50
         b  5.00  65.0 -5.00
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py
index e2633f7eeb8..a472e131370 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py
@@ -1552,6 +1552,37 @@ def __getitem__(self, key):
         return self.loc[:, key]
 
 
+# As of 0.30.1, modin implements drop_duplicates via the unique() query compiler method instead of the duplicated() frontend method.
+# TODO SNOW-1758721: use the more efficient implementation
+@register_base_override("drop_duplicates")
+def drop_duplicates(
+    self, keep="first", inplace=False, **kwargs
+):  # noqa: PR01, RT01, D200
+    """
+    Return `BasePandasDataset` with duplicate rows removed.
+    """
+    inplace = validate_bool_kwarg(inplace, "inplace")
+    ignore_index = kwargs.get("ignore_index", False)
+    subset = kwargs.get("subset", None)
+    if subset is not None:
+        if is_list_like(subset):
+            if not isinstance(subset, list):
+                subset = list(subset)  # pragma: no cover
+        else:
+            subset = [subset]
+        df = self[subset]
+    else:
+        df = self
+    duplicated = df.duplicated(keep=keep)
+    result = self[~duplicated]
+    if ignore_index:
+        result.index = pandas.RangeIndex(stop=len(result))
+    if inplace:
+        self._update_inplace(result._query_compiler)  # pragma: no cover
+    else:
+        return result
+
+
 # Snowpark pandas does extra argument validation, which may need to be upstreamed.
 @register_base_override("sort_values")
 def sort_values(
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py
index d8803cfda0c..4e8ea15f83d 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py
@@ -1480,46 +1480,6 @@ def mask(
     )
 
 
-# Snowpark pandas has a fix for a pandas behavior change. It is available in Modin 0.30.1 (SNOW-1552497).
-@register_dataframe_accessor("melt")
-def melt(
-    self,
-    id_vars=None,
-    value_vars=None,
-    var_name=None,
-    value_name="value",
-    col_level=None,
-    ignore_index=True,
-):  # noqa: PR01, RT01, D200
-    """
-    Unpivot a ``DataFrame`` from wide to long format, optionally leaving identifiers set.
- """ - # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions - if id_vars is None: - id_vars = [] - if not is_list_like(id_vars): - id_vars = [id_vars] - if value_vars is None: - # Behavior of Index.difference changed in 2.2.x - # https://github.com/pandas-dev/pandas/pull/55113 - # This change needs upstream to Modin: - # https://github.com/modin-project/modin/issues/7206 - value_vars = self.columns.drop(id_vars) - if var_name is None: - columns_name = self._query_compiler.get_index_name(axis=1) - var_name = columns_name if columns_name is not None else "variable" - return self.__constructor__( - query_compiler=self._query_compiler.melt( - id_vars=id_vars, - value_vars=value_vars, - var_name=var_name, - value_name=value_name, - col_level=col_level, - ignore_index=ignore_index, - ) - ) - - # Snowpark pandas does more thorough error checking. @register_dataframe_accessor("merge") def merge( diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index 0c531cc4f58..ce9266d0e1f 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -24,6 +24,7 @@ from pandas._typing import ( AggFuncType, AnyArrayLike, + ArrayLike, Axis, FillnaOptions, IgnoreRaise, @@ -1596,6 +1597,19 @@ def rename( return self_cp +# In some cases, modin after 0.30.1 returns a DatetimeArray instead of a numpy array. This +# still differs from the expected pandas behavior, which would return DatetimeIndex +# (see SNOW-1019312). +@register_series_accessor("unique") +def unique(self) -> ArrayLike: # noqa: RT01, D200 + """ + Return unique values of Series object. + """ + # `values` can't be used here because it performs unnecessary conversion, + # after which the result type does not match the pandas + return self.__constructor__(query_compiler=self._query_compiler.unique()).to_numpy() + + # Modin defaults to pandas for some arguments for unstack @register_series_accessor("unstack") def unstack( diff --git a/tests/integ/modin/frame/test_apply.py b/tests/integ/modin/frame/test_apply.py index e76c3f9e28b..eca569aa99d 100644 --- a/tests/integ/modin/frame/test_apply.py +++ b/tests/integ/modin/frame/test_apply.py @@ -23,6 +23,7 @@ from snowflake.snowpark.types import DoubleType, PandasSeriesType from tests.integ.modin.series.test_apply import create_func_with_return_type_hint from tests.integ.modin.utils import ( + PANDAS_VERSION_PREDICATE, assert_snowpark_pandas_equal_to_pandas, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, create_test_dfs, @@ -30,6 +31,11 @@ ) from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker +pytestmark = pytest.mark.skipif( + PANDAS_VERSION_PREDICATE, + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) + # TODO SNOW-891796: replace native_pd with pd after allowing using snowpandas module/function in UDF # test data which has a python type as return type that is not a pandas Series/pandas DataFrame/tuple/list diff --git a/tests/integ/modin/frame/test_apply_axis_0.py b/tests/integ/modin/frame/test_apply_axis_0.py index 2edafc6b830..f5b8eaac10a 100644 --- a/tests/integ/modin/frame/test_apply_axis_0.py +++ b/tests/integ/modin/frame/test_apply_axis_0.py @@ -3,17 +3,20 @@ # import datetime +import re import modin.pandas as pd import numpy as np import pandas as native_pd import pytest +from packaging.version import Version 
 from pytest import param
 
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from snowflake.snowpark.exceptions import SnowparkSQLException
 from tests.integ.modin.series.test_apply import create_func_with_return_type_hint
 from tests.integ.modin.utils import (
+    PANDAS_VERSION_PREDICATE,
     assert_snowpark_pandas_equal_to_pandas,
     assert_snowpark_pandas_equals_to_pandas_without_dtypecheck,
     create_test_dfs,
@@ -21,6 +24,11 @@
 )
 from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker
 
+pytestmark = pytest.mark.skipif(
+    PANDAS_VERSION_PREDICATE,
+    reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+)
+
 # test data which has a python type as return type that is not a pandas Series/pandas DataFrame/tuple/list
 BASIC_DATA_FUNC_PYTHON_RETURN_TYPE_MAP = [
     [[[1.0, 2.2], [3, np.nan]], np.min, "float"],
@@ -220,9 +228,13 @@ def test_axis_0_return_dataframe_not_supported():
 
     # Note that pandas returns failure "ValueError: If using all scalar values, you must pass an index" which
     # doesn't explain this isn't supported. We go with the default returned by pandas in this case.
-    with pytest.raises(
-        SnowparkSQLException, match="The truth value of a DataFrame is ambiguous."
-    ):
+    if Version(native_pd.__version__) > Version("2.2.1"):
+        expected_message = re.escape(
+            "Data must be 1-dimensional, got ndarray of shape (2, 1) instead"
+        )
+    else:
+        expected_message = "The truth value of a DataFrame is ambiguous."
+    with pytest.raises(SnowparkSQLException, match=expected_message):
         # return value
         snow_df.apply(lambda x: native_pd.DataFrame([1, 2]), axis=0).to_pandas()
diff --git a/tests/integ/modin/frame/test_applymap.py b/tests/integ/modin/frame/test_applymap.py
index 91b69c51427..e24076401dc 100644
--- a/tests/integ/modin/frame/test_applymap.py
+++ b/tests/integ/modin/frame/test_applymap.py
@@ -15,12 +15,18 @@
     create_func_with_return_type_hint,
 )
 from tests.integ.modin.utils import (
+    PANDAS_VERSION_PREDICATE,
     assert_snowpark_pandas_equal_to_pandas,
     create_test_dfs,
     eval_snowpark_pandas_result,
 )
 from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker
 
+pytestmark = pytest.mark.skipif(
+    PANDAS_VERSION_PREDICATE,
+    reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+)
+
 
 @pytest.mark.parametrize("data,func,return_type", BASIC_DATA_FUNC_RETURN_TYPE_MAP)
 @sql_count_checker(query_count=7, udf_count=1)
diff --git a/tests/integ/modin/frame/test_cache_result.py b/tests/integ/modin/frame/test_cache_result.py
index 18283441858..27152c0f34a 100644
--- a/tests/integ/modin/frame/test_cache_result.py
+++ b/tests/integ/modin/frame/test_cache_result.py
@@ -11,6 +11,7 @@
 
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from tests.integ.modin.utils import (
+    PANDAS_VERSION_PREDICATE,
     assert_snowpark_pandas_equals_to_pandas_without_dtypecheck,
     create_test_dfs,
 )
@@ -176,6 +177,10 @@ def test_cache_result_post_pivot(self, inplace, simple_test_data):
             cached_snow_df, native_df
         )
 
+    @pytest.mark.skipif(
+        PANDAS_VERSION_PREDICATE,
+        reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+    )
     def test_cache_result_post_apply(self, inplace, simple_test_data):
         # In this test, the caching doesn't aid in the query counts since
         # the implementation of apply(axis=1) itself contains intermediate
@@ -204,6 +209,10 @@
             native_df,
         )
 
+    @pytest.mark.skipif(
+        PANDAS_VERSION_PREDICATE,
+        reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+    )
     def test_cache_result_post_applymap(self, inplace, simple_test_data):
         # The high query counts in this test case come from the setup and definition
         # of the UDFs used.
diff --git a/tests/integ/modin/groupby/test_all_any.py b/tests/integ/modin/groupby/test_all_any.py
index a13712d9a8e..6e1d6513a97 100644
--- a/tests/integ/modin/groupby/test_all_any.py
+++ b/tests/integ/modin/groupby/test_all_any.py
@@ -14,6 +14,7 @@
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from snowflake.snowpark.exceptions import SnowparkSQLException
 from tests.integ.modin.utils import (
+    PANDAS_VERSION_PREDICATE,
     assert_frame_equal,
     create_test_dfs,
     eval_snowpark_pandas_result as _eval_snowpark_pandas_result,
@@ -98,6 +99,10 @@ def test_all_any_invalid_types(data, msg):
         pd.DataFrame(data).groupby("by").any().to_pandas()
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION_PREDICATE,
+    reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+)
 @sql_count_checker(query_count=5, join_count=1, udtf_count=1)
 def test_all_any_chained():
     data = {
diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py
index c6c805a0ca3..7905842ee17 100644
--- a/tests/integ/modin/groupby/test_groupby_apply.py
+++ b/tests/integ/modin/groupby/test_groupby_apply.py
@@ -18,6 +18,7 @@
 from snowflake.snowpark.exceptions import SnowparkSQLException
 from snowflake.snowpark.modin.plugin.extensions.utils import try_convert_index_to_native
 from tests.integ.modin.utils import (
+    PANDAS_VERSION_PREDICATE,
     assert_snowpark_pandas_equal_to_pandas,
     assert_snowpark_pandas_equals_to_pandas_without_dtypecheck,
     assert_values_equal,
@@ -27,6 +28,11 @@
 )
 from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker
 
+pytestmark = pytest.mark.skipif(
+    PANDAS_VERSION_PREDICATE,
+    reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+)
+
 # Use the workaround shown below for applying functions that are attributes
 # of this module.
 # https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
diff --git a/tests/integ/modin/groupby/test_groupby_transform.py b/tests/integ/modin/groupby/test_groupby_transform.py
index e6fc4ca87bd..a3d730776e9 100644
--- a/tests/integ/modin/groupby/test_groupby_transform.py
+++ b/tests/integ/modin/groupby/test_groupby_transform.py
@@ -10,11 +10,17 @@
 
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from tests.integ.modin.utils import (
+    PANDAS_VERSION_PREDICATE,
     create_test_dfs,
     eval_snowpark_pandas_result as _eval_snowpark_pandas_result,
 )
 from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker
 
+pytestmark = pytest.mark.skipif(
+    PANDAS_VERSION_PREDICATE,
+    reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+)
+
 
 def eval_snowpark_pandas_result(*args, **kwargs):
     # Some calls to the native pandas function propagate attrs while some do not, depending on the values of its arguments.
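
The same module-level skip gate recurs across the UDF/sproc test files in this change. A minimal consolidated sketch of the pattern, assembled from the hunks above and below (PANDAS_VERSION_PREDICATE itself is defined in tests/integ/modin/utils.py, whose diff appears further down):

import pandas as native_pd
import pytest
from packaging import version

# True when the locally installed pandas is 2.2.3 or newer, i.e. newer than what
# Snowflake Anaconda ships at the time of this change.
PANDAS_VERSION_PREDICATE = version.parse(native_pd.__version__) >= version.parse("2.2.3")

# Assigning a module-level `pytestmark` applies the mark to every test collected
# from the module, so this single assignment skips the whole file when the
# predicate holds.
pytestmark = pytest.mark.skipif(
    PANDAS_VERSION_PREDICATE,
    reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
)
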
diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py
index 105d6d15527..5ff3d8b3d31 100644
--- a/tests/integ/modin/index/test_df_series_creation_with_index.py
+++ b/tests/integ/modin/index/test_df_series_creation_with_index.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pandas as native_pd
 import pytest
+from packaging.version import Version
 
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from tests.integ.modin.utils import assert_frame_equal, assert_series_equal
@@ -1337,13 +1338,16 @@ def test_create_series_with_df_index_negative():
 
 @sql_count_checker(query_count=0)
 def test_create_series_with_df_data_negative():
-    with pytest.raises(
-        ValueError,
-        match=re.escape(
+    if Version(native_pd.__version__) > Version("2.2.1"):
+        expected_message = re.escape(
+            "Data must be 1-dimensional, got ndarray of shape (3, 2) instead"
+        )
+    else:
+        expected_message = re.escape(
             "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool()"
             ", a.item(), a.any() or a.all()."
-        ),
-    ):
+        )
+    with pytest.raises(ValueError, match=expected_message):
         native_pd.Series(native_pd.DataFrame([[1, 2], [3, 4], [5, 6]]))
     with pytest.raises(ValueError, match="Data cannot be a DataFrame"):
         pd.Series(pd.DataFrame([[1, 2], [3, 4], [5, 6]]))
diff --git a/tests/integ/modin/series/test_apply.py b/tests/integ/modin/series/test_apply.py
index 5ab9ea486fb..b3c2ec98156 100644
--- a/tests/integ/modin/series/test_apply.py
+++ b/tests/integ/modin/series/test_apply.py
@@ -21,6 +21,7 @@
 from snowflake.snowpark.functions import udf
 from snowflake.snowpark.types import DoubleType, StringType, VariantType
 from tests.integ.modin.utils import (
+    PANDAS_VERSION_PREDICATE,
     ColumnSchema,
     assert_snowpark_pandas_equal_to_pandas,
     assert_snowpark_pandas_equals_to_pandas_without_dtypecheck,
@@ -30,6 +31,11 @@
 )
 from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker
 
+pytestmark = pytest.mark.skipif(
+    PANDAS_VERSION_PREDICATE,
+    reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+)
+
 BASIC_DATA_FUNC_RETURN_TYPE_MAP = [
     ([1, 2, 3, None], lambda x: x + 1, "int"),
     param(
diff --git a/tests/integ/modin/test_modin_stored_procedures.py b/tests/integ/modin/test_modin_stored_procedures.py
index ec6d6c2e580..8387c759350 100644
--- a/tests/integ/modin/test_modin_stored_procedures.py
+++ b/tests/integ/modin/test_modin_stored_procedures.py
@@ -4,19 +4,31 @@
 #
 
 import modin.pandas as pd
+import pandas as native_pd
+import pytest
+from packaging import version
 
 from snowflake.snowpark import Session
 from snowflake.snowpark.functions import sproc
-from snowflake.snowpark.modin.plugin import (
-    supported_modin_version,
-    supported_pandas_version,
-)
 from tests.integ.utils.sql_counter import sql_count_checker
 from tests.utils import multithreaded_run
 
+pytestmark = pytest.mark.skipif(
+    version.parse(native_pd.__version__) != version.parse("2.2.1"),
+    reason="SNOW-1758760: modin stored procedure test must pin pandas==2.2.1 and modin==0.28.1",
+)
+
+# Must pin modin version to match version available in Snowflake Anaconda
+SPROC_MODIN_VERSION = "0.28.1"
+
 PACKAGE_LIST = [
-    f"pandas=={supported_pandas_version}",
-    f"modin=={supported_modin_version}",
+    # modin 0.30.1 supports any pandas 2.2.x, so just pick whichever one is installed in the client.
+    # Note that because we specify `snowflake-snowpark-python` as a package here, it will pick whatever
+    # version of the package is available in anaconda, not the latest `main` branch.
+    # The behavior of stored procedures with `main` is verified in server-side tests and the stored
+    # procedure Jenkins job.
+    f"pandas=={native_pd.__version__}",
+    f"modin=={SPROC_MODIN_VERSION}",
     "snowflake-snowpark-python",
     "numpy",
 ]
diff --git a/tests/integ/modin/test_session.py b/tests/integ/modin/test_session.py
index 93b4cecb6e7..b624dd3fc60 100644
--- a/tests/integ/modin/test_session.py
+++ b/tests/integ/modin/test_session.py
@@ -16,7 +16,11 @@
     _get_active_sessions,
     _remove_session,
 )
-from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result
+from tests.integ.modin.utils import (
+    PANDAS_VERSION_PREDICATE,
+    create_test_dfs,
+    eval_snowpark_pandas_result,
+)
 from tests.integ.utils.sql_counter import sql_count_checker
 from tests.utils import running_on_jenkins, running_on_public_ci
 
@@ -212,6 +216,10 @@ def test_snowpark_pandas_session_class_does_not_exist_snow_1022098():
         pd.Session
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION_PREDICATE,
+    reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+)
 @pytest.mark.parametrize(
     "operation",
     [
diff --git a/tests/integ/modin/test_sql_counter.py b/tests/integ/modin/test_sql_counter.py
index a1d7f9e61d8..cffd8267351 100644
--- a/tests/integ/modin/test_sql_counter.py
+++ b/tests/integ/modin/test_sql_counter.py
@@ -8,7 +8,7 @@
 
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from snowflake.snowpark import QueryRecord
-from tests.integ.modin.utils import assert_frame_equal
+from tests.integ.modin.utils import PANDAS_VERSION_PREDICATE, assert_frame_equal
 from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker
 
 
@@ -126,6 +126,10 @@ def test_sql_counter_with_fallback_count():
     assert len(df) == 3
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION_PREDICATE,
+    reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
+)
 @sql_count_checker(query_count=5, join_count=2, udtf_count=1)
 def test_sql_counter_with_df_udtf_count():
     df = pd.DataFrame([[1, 2], [3, 4]]).apply(lambda x: str(type(x)), axis=1, raw=True)
diff --git a/tests/integ/modin/utils.py b/tests/integ/modin/utils.py
index d4dd5dd96fa..f714a268e52 100644
--- a/tests/integ/modin/utils.py
+++ b/tests/integ/modin/utils.py
@@ -15,6 +15,7 @@
 import pandas.testing as tm
 import pytest
 from modin.pandas import DataFrame, Index, Series
+from packaging import version
 from pandas import isna
 from pandas._typing import Scalar
 from pandas.core.dtypes.common import is_list_like
@@ -27,6 +28,10 @@
 from snowflake.snowpark.session import Session
 from snowflake.snowpark.types import StructField, StructType
 
+PANDAS_VERSION_PREDICATE = version.parse(native_pd.__version__) >= version.parse(
+    "2.2.3"
+)
+
 ValuesEqualType = Optional[
     Union[
         Scalar,
diff --git a/tests/integ/test_df_to_snowpark_pandas.py b/tests/integ/test_df_to_snowpark_pandas.py
index ede9b10e85c..9e5ad8a7f05 100644
--- a/tests/integ/test_df_to_snowpark_pandas.py
+++ b/tests/integ/test_df_to_snowpark_pandas.py
@@ -43,21 +43,19 @@ def test_to_snowpark_pandas_no_modin(session, tmp_table_basic):
     try:
         import modin  # noqa: F401
     except ModuleNotFoundError:
-        # Current Snowpark Python installs pandas==2.2.2, but Snowpark pandas depends on modin
-        # 0.28.1, which needs pandas==2.2.1. The pandas version check is currently performed
-        # before Snowpark pandas checks whether modin is installed.
-        # TODO: SNOW-1552497: after upgrading to modin 0.30.1, Snowpark pandas will support
-        # all pandas 2.2.x, and this function call will raise a ModuleNotFoundError since
-        # modin is not installed.
-        match = (
-            "Snowpark pandas does not support Python 3.8. Please update to Python 3.9 or later"
-            if sys.version_info.major == 3 and sys.version_info.minor == 8
-            else "does not match the supported pandas version in Snowpark pandas"
-        )
-        with pytest.raises(
-            RuntimeError,
-            match=match,
-        ):
+        if sys.version_info.major == 3 and sys.version_info.minor == 8:
+            # Snowpark pandas does not support Python 3.8
+            ctx = pytest.raises(
+                RuntimeError,
+                match="Snowpark pandas does not support Python 3.8. Please update to Python 3.9 or later",
+            )
+        else:
+            # This function call will raise a ModuleNotFoundError since modin is not installed
+            ctx = pytest.raises(
+                ModuleNotFoundError,
+                match="Modin is not installed.",
+            )
+        with ctx:
             snowpark_df.to_snowpark_pandas()
     else:
         snowpark_df.to_snowpark_pandas()  # should have no errors
diff --git a/tox.ini b/tox.ini
index 10d08fb7d30..f4348830878 100644
--- a/tox.ini
+++ b/tox.ini
@@ -42,6 +42,7 @@ deps =
     .[development]
    .[opentelemetry]
     {env:SNOWFLAKE_PYTEST_MODIN_DEPS}
+    {env:SNOWFLAKE_PYTEST_PANDAS_DEPS}
 install_command = bash ./scripts/tox_install_cmd.sh {opts} {packages}
 setenv =
     COVERAGE_FILE = {env:COVERAGE_FILE:{toxworkdir}/.coverage.{envname}}
@@ -66,6 +67,7 @@ setenv =
     SNOWFLAKE_PYTEST_DAILY_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_DAILY_PARALLELISM:} {env:SNOWFLAKE_PYTEST_COV_CMD} --ignore=tests/resources {env:SNOWFLAKE_PYTEST_IGNORE_MODIN_CMD}
     # This configures the extra dependency required by modin test
     modin: SNOWFLAKE_PYTEST_MODIN_DEPS = [modin-development]
+    modin_pandas_version: SNOWFLAKE_PYTEST_PANDAS_DEPS = pandas=={env:MODIN_PANDAS_PATCH_VERSION}
    SNOW_1314507_WORKAROUND_RERUN_FLAGS = --reruns 5 --reruns-delay 1 --only-rerun "Insufficient resource during interleaved execution."
     MODIN_PYTEST_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_PARALLELISM:} {env:SNOWFLAKE_PYTEST_COV_CMD} --ignore=tests/resources
     MODIN_PYTEST_DAILY_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_DAILY_PARALLELISM:} {env:SNOWFLAKE_PYTEST_COV_CMD} --ignore=tests/resources
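
The relaxed client-side version gate in src/snowflake/snowpark/modin/plugin/__init__.py above accepts any pandas 2.2.x patch release. A minimal sketch of the intended accept/reject behavior follows; is_supported_pandas is a hypothetical helper written here for illustration, not part of the plugin:

from packaging import version

def is_supported_pandas(pandas_version: str) -> bool:
    # Hypothetical helper mirroring the plugin's check: both the major and the
    # minor component must equal 2.2; any patch release (2.2.1, 2.2.2, ...) passes.
    parsed = version.parse(pandas_version)
    return parsed.major == 2 and parsed.minor == 2

assert is_supported_pandas("2.2.1") and is_supported_pandas("2.2.2")
# Joining the plugin's two != checks with `or` (rather than `and`) is what rejects
# versions that match on only one component, such as 2.1.0 or 3.2.0.
assert not is_supported_pandas("2.1.0") and not is_supported_pandas("3.2.0")

Locally, the new pandas-patch tox factor can be exercised the same way the workflow job does, e.g. MODIN_PANDAS_PATCH_VERSION=2.2.2 python -m tox -e "modin_pandas_version-py39-snowparkpandasdailynotdoctest-modin-ci".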