
build(datasets): upgrade s3fs to newer calver #348

Closed
24 commits
dcef71d
build(datasets): upgrade to newer, CalVered s3fs
MatthiasRoels Sep 21, 2023
5d87b28
fix(datasets): unit tests for matplotlib writer leverage moto server
MatthiasRoels Sep 26, 2023
dd5e433
fix(datasets): unit tests for dask pq dataset now leverage moto server
MatthiasRoels Sep 26, 2023
73a5ad2
build(datasets): bump moto version for server mode
MatthiasRoels Sep 27, 2023
f08fce6
fix(datasets): apply linting to test files
MatthiasRoels Sep 27, 2023
5109fb5
fix(datasets): unit tests for video dataset now leverage moto server
MatthiasRoels Sep 27, 2023
a5bfd4c
Fix(datasets): linting errors
MatthiasRoels Oct 2, 2023
51859ad
Merge branch 'main' into build/datasets-upgrade-s3fs-to-newer-calver
MatthiasRoels Oct 10, 2023
e578ad3
Merge branch 'main' into build/datasets-upgrade-s3fs-to-newer-calver
MatthiasRoels Oct 11, 2023
308fe19
fix(datasets): set correct CalVer for s3fs in setup.py
MatthiasRoels Oct 12, 2023
4fb1b02
Merge branch 'main' into build/datasets-upgrade-s3fs-to-newer-calver
MatthiasRoels Oct 12, 2023
3e43613
Merge branch 'main' into build/datasets-upgrade-s3fs-to-newer-calver
MatthiasRoels Oct 18, 2023
31510e8
Merge branch 'main' into build/datasets-upgrade-s3fs-to-newer-calver
noklam Nov 20, 2023
3bfbfdc
Merge branch 'main' into build/datasets-upgrade-s3fs-to-newer-calver
merelcht Nov 27, 2023
66931e8
Merge branch 'main' into build/datasets-upgrade-s3fs-to-newer-calver
merelcht Nov 29, 2023
619213c
Add patching for AWSResponse to make moto and aiobotocore work
merelcht Nov 30, 2023
cee9b80
Fix lint
merelcht Nov 30, 2023
dabd773
Add AWS mocking to lazy polars dataset test + temporarily ignore spar…
merelcht Nov 30, 2023
3bc77e0
Try use same moto for all python versions
merelcht Nov 30, 2023
e5b551d
Merge branch 'main' into build/datasets-upgrade-s3fs-to-newer-calver
merelcht Nov 30, 2023
cf32731
Undo polars changes
merelcht Nov 30, 2023
32e0e31
Merge branch 'build/datasets-upgrade-s3fs-to-newer-calver' of github.…
merelcht Nov 30, 2023
402439e
chore(datasets): fix accidental reference to NumPy (#450)
deepyaman Nov 30, 2023
1857887
Merge branch 'main' into build/datasets-upgrade-s3fs-to-newer-calver
merelcht Nov 30, 2023
7 changes: 3 additions & 4 deletions kedro-datasets/setup.py
@@ -6,7 +6,7 @@
PANDAS = "pandas>=1.3, <3.0"
SPARK = "pyspark>=2.2, <4.0"
HDFS = "hdfs>=2.5.8, <3.0"
S3FS = "s3fs>=0.3.0, <0.5"
S3FS = "s3fs>=2021.4, <2024.1" # Upper bound set arbitrarily, to be reassessed in early 2024
POLARS = "polars>=0.18.0"
DELTA = "delta-spark~=1.2.1"

@@ -184,8 +184,7 @@ def _collect_requirements(requires):
"matplotlib>=3.0.3, <3.4; python_version < '3.10'", # 3.4.0 breaks holoviews
"matplotlib>=3.5, <3.6; python_version >= '3.10'",
"memory_profiler>=0.50.0, <1.0",
"moto==1.3.7; python_version < '3.10'",
"moto==4.1.12; python_version >= '3.10'",
"moto[server]==4.2.4",
"networkx~=2.4",
"opencv-python~=4.5.5.64",
"openpyxl>=3.0.3, <4.0",
@@ -209,8 +208,8 @@ def _collect_requirements(requires):
"redis~=4.1",
"requests-mock~=1.6",
"requests~=2.20",
"s3fs>=2021.04, <2024.1",
"ruff~=0.0.290",
"s3fs>=0.3.0, <0.5", # Needs to be at least 0.3.0 to make use of `cachable` attribute on S3FileSystem.
"snowflake-snowpark-python~=1.0; python_version == '3.9'",
"scikit-learn>=1.0.2,<2",
"scipy>=1.7.3",
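Context for the dependency changes above: the CalVer releases of s3fs are built on aiobotocore's async client, which moto's in-process @mock_s3 decorator cannot reliably intercept, so the test requirements switch to moto[server] and the suite talks to a real HTTP endpoint instead. The following is a minimal sketch of that pattern, not part of the diff; the host, port, bucket name and fake credentials simply mirror the constants introduced in conftest.py below.

from moto.moto_server.threaded_moto_server import ThreadedMotoServer
from s3fs import S3FileSystem

# Start moto in server mode; any credentials are accepted, the endpoint is what matters.
server = ThreadedMotoServer(ip_address="127.0.0.1", port=5555)
server.start()

fs = S3FileSystem(
    key="fake_access_key",
    secret="fake_secret_key",
    client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"},
)
fs.mkdir("test_bucket")                      # creates the bucket on the moto server
fs.pipe("test_bucket/hello.txt", b"hello")   # write an object over HTTP
assert fs.cat("test_bucket/hello.txt") == b"hello"

server.stop()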
93 changes: 93 additions & 0 deletions kedro-datasets/tests/conftest.py
@@ -4,10 +4,19 @@
discover them automatically. More info here:
https://docs.pytest.org/en/latest/fixture.html
"""
import json
import os

import requests
from kedro.io.core import generate_timestamp
from moto.moto_server.threaded_moto_server import ThreadedMotoServer
from pytest import fixture

BUCKET_NAME = "test_bucket"
IP_ADDRESS = "127.0.0.1"
PORT = 5555
ENDPOINT_URI = f"http://{IP_ADDRESS}:{PORT}/"


@fixture(params=[None])
def load_version(request):
@@ -32,3 +41,87 @@ def save_args(request):
@fixture(params=[None])
def fs_args(request):
return request.param


@fixture(params=[None])
def mock_fs_args(request):
fs_args = {
# NB: use moto server to mock S3
"client_kwargs": {"endpoint_url": ENDPOINT_URI}
}

if isinstance(request.param, dict):
fs_args.update(request.param)

return fs_args


@fixture
def credentials():
return {
"key": "fake_access_key",
"secret": "fake_secret_key",
}
Comment on lines +60 to +64

Contributor: Why are there 2 sets of credentials? Do we need key/secret, or the AWS_XXXXX environment variables?

Contributor (Author): One is for the credentials file, the other is environment variables. The env vars are required for bucket setup etc., while the credentials are used in the Dataset classes.
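To restate the reply in code, here is an illustrative sketch (not part of the diff) of how the two sets end up being used; the values and the dask ParquetDataset call mirror the fixtures in this conftest and the test changes further down.

import os

from kedro_datasets.dask import ParquetDataset

# 1. Env vars: read by botocore when the session-scoped moto server fixture
#    and the boto3 client set up buckets.
os.environ.setdefault("AWS_ACCESS_KEY_ID", "fake_access_key")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "fake_secret_key")

# 2. Credentials dict: passed into a dataset, which forwards key/secret to
#    S3FileSystem together with fs_args pointing at the moto endpoint.
dataset = ParquetDataset(
    filepath="s3://test_bucket/test.parquet",
    credentials={"key": "fake_access_key", "secret": "fake_secret_key"},
    fs_args={"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}},
)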



@fixture(scope="session")
def moto_server():
# This fixture is session-scoped, meaning that we can re-use the MotoServer across all tests
server = ThreadedMotoServer(ip_address=IP_ADDRESS, port=PORT)
server.start()

if "AWS_SECRET_ACCESS_KEY" not in os.environ:
os.environ["AWS_SECRET_ACCESS_KEY"] = "fake_access_key"
if "AWS_ACCESS_KEY_ID" not in os.environ:
os.environ["AWS_ACCESS_KEY_ID"] = "fake_secret_key"

yield

server.stop()


def _reset_moto_server():
# We reuse the MotoServer for all S3 related tests
# But we do want a clean state for every test
requests.post(f"{ENDPOINT_URI}/moto-api/reset", timeout=2.0)


def _get_boto3_client():
from botocore.session import Session # pylint: disable=import-outside-toplevel

# NB: we use the sync botocore client for setup
session = Session()
return session.create_client(service_name="s3", endpoint_url=ENDPOINT_URI)


@fixture
def mocked_s3_bucket(moto_server): # pylint: disable=unused-argument
"""Create a bucket for testing using moto."""
_reset_moto_server()
client = _get_boto3_client()
client.create_bucket(Bucket=BUCKET_NAME)
yield client


@fixture
def mocked_encrypted_s3_bucket(moto_server): # pylint: disable=unused-argument
bucket_policy = {
"Version": "2012-10-17",
"Id": "PutObjPolicy",
"Statement": [
{
"Sid": "DenyUnEncryptedObjectUploads",
"Effect": "Deny",
"Principal": "*",
"Action": "s3:PutObject",
"Resource": f"arn:aws:s3:::{BUCKET_NAME}/*",
"Condition": {"Null": {"s3:x-amz-server-side-encryption": "aws:kms"}},
}
],
}
bucket_policy = json.dumps(bucket_policy)
_reset_moto_server()
client = _get_boto3_client()
client.create_bucket(Bucket=BUCKET_NAME)
client.put_bucket_policy(Bucket=BUCKET_NAME, Policy=bucket_policy)
yield client
100 changes: 32 additions & 68 deletions kedro-datasets/tests/dask/test_parquet_dataset.py
@@ -1,37 +1,21 @@
import boto3
from io import BytesIO

import dask.dataframe as dd
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pytest
from moto import mock_s3
from pandas.testing import assert_frame_equal
from s3fs import S3FileSystem

from kedro_datasets._io import DatasetError
from kedro_datasets.dask import ParquetDataset

FILE_NAME = "test.parquet"
BUCKET_NAME = "test_bucket"
AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"}

# Pathlib cannot be used since it strips out the second slash from "s3://"
S3_PATH = f"s3://{BUCKET_NAME}/{FILE_NAME}"


@pytest.fixture
def mocked_s3_bucket():
"""Create a bucket for testing using moto."""
with mock_s3():
conn = boto3.client(
"s3",
aws_access_key_id="fake_access_key",
aws_secret_access_key="fake_secret_key",
)
conn.create_bucket(Bucket=BUCKET_NAME)
yield conn


@pytest.fixture
def dummy_dd_dataframe() -> dd.DataFrame:
df = pd.DataFrame(
@@ -41,37 +25,28 @@ def dummy_dd_dataframe() -> dd.DataFrame:


@pytest.fixture
def mocked_s3_object(tmp_path, mocked_s3_bucket, dummy_dd_dataframe: dd.DataFrame):
"""Creates test data and adds it to mocked S3 bucket."""
pandas_df = dummy_dd_dataframe.compute()
table = pa.Table.from_pandas(pandas_df)
temporary_path = tmp_path / FILE_NAME
pq.write_table(table, str(temporary_path))

mocked_s3_bucket.put_object(
Bucket=BUCKET_NAME, Key=FILE_NAME, Body=temporary_path.read_bytes()
)
return mocked_s3_bucket


@pytest.fixture
def s3_dataset(load_args, save_args):
def s3_dataset(
mocked_s3_bucket, credentials, mock_fs_args, save_args, load_args
): # pylint: disable=unused-argument
return ParquetDataset(
filepath=S3_PATH,
credentials=AWS_CREDENTIALS,
credentials=credentials,
fs_args=mock_fs_args,
load_args=load_args,
save_args=save_args,
)


@pytest.fixture()
def s3fs_cleanup():
# clear cache so we get a clean slate every time we instantiate a S3FileSystem
yield
S3FileSystem.cachable = False
@pytest.fixture
def mocked_parquet_in_s3(mocked_s3_bucket, dummy_dd_dataframe):
pandas_df = dummy_dd_dataframe.compute()
buffer = BytesIO()
pandas_df.to_parquet(buffer)
buffer.seek(0)
mocked_s3_bucket.put_object(Bucket=BUCKET_NAME, Key=FILE_NAME, Body=buffer)
return S3_PATH


@pytest.mark.usefixtures("s3fs_cleanup")
class TestParquetDataset:
def test_incorrect_credentials_load(self):
"""Test that incorrect credential keys won't instantiate dataset."""
@@ -91,22 +66,25 @@ def test_empty_credentials_load(self, bad_credentials):
with pytest.raises(DatasetError, match=pattern):
parquet_dataset.load().compute()

def test_pass_credentials(self, mocker):
"""Test that AWS credentials are passed successfully into boto3
client instantiation on creating S3 connection."""
client_mock = mocker.patch("botocore.session.Session.create_client")
s3_dataset = ParquetDataset(filepath=S3_PATH, credentials=AWS_CREDENTIALS)
pattern = r"Failed while loading data from data set ParquetDataset\(.+\)"
with pytest.raises(DatasetError, match=pattern):
s3_dataset.load().compute()
def test_exists(self, s3_dataset, dummy_dd_dataframe):
"""Test `exists` method invocation for both existing and
nonexistent data set."""
assert not s3_dataset.exists()
s3_dataset.save(dummy_dd_dataframe)
assert s3_dataset.exists()

assert client_mock.call_count == 1
args, kwargs = client_mock.call_args_list[0]
assert args == ("s3",)
assert kwargs["aws_access_key_id"] == AWS_CREDENTIALS["key"]
assert kwargs["aws_secret_access_key"] == AWS_CREDENTIALS["secret"]
def test_load_data(
self, mocked_parquet_in_s3, mock_fs_args, credentials, dummy_dd_dataframe
):
"""Test loading the data from S3."""
dataset = ParquetDataset(
filepath=mocked_parquet_in_s3,
credentials=credentials,
fs_args=mock_fs_args,
)
loaded_data = dataset.load()
assert_frame_equal(loaded_data.compute(), dummy_dd_dataframe.compute())

@pytest.mark.usefixtures("mocked_s3_bucket")
def test_save_data(self, s3_dataset):
"""Test saving the data to S3."""
pd_data = pd.DataFrame(
@@ -117,20 +95,6 @@ def test_save_data(self, s3_dataset):
loaded_data = s3_dataset.load()
assert_frame_equal(loaded_data.compute(), dd_data.compute())

@pytest.mark.usefixtures("mocked_s3_object")
def test_load_data(self, s3_dataset, dummy_dd_dataframe):
"""Test loading the data from S3."""
loaded_data = s3_dataset.load()
assert_frame_equal(loaded_data.compute(), dummy_dd_dataframe.compute())

@pytest.mark.usefixtures("mocked_s3_bucket")
def test_exists(self, s3_dataset, dummy_dd_dataframe):
"""Test `exists` method invocation for both existing and
nonexistent data set."""
assert not s3_dataset.exists()
s3_dataset.save(dummy_dd_dataframe)
assert s3_dataset.exists()

def test_save_load_locally(self, tmp_path, dummy_dd_dataframe):
"""Test loading the data locally."""
file_path = str(tmp_path / "some" / "dir" / FILE_NAME)
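Note on the removed s3fs_cleanup fixture: its teardown set S3FileSystem.cachable = False, which disables instance caching rather than clearing it, and it becomes unnecessary once every test resets the shared moto server and addresses it via an explicit endpoint_url. If cached S3FileSystem instances ever do leak state between tests, fsspec exposes an explicit cache reset; the fixture below is a sketch of that alternative (an assumption, not something this PR adds).

import pytest
from s3fs import S3FileSystem


@pytest.fixture(autouse=True)
def clear_s3fs_instance_cache():
    yield
    # fsspec caches filesystem instances keyed by constructor arguments;
    # dropping the cache forces each test to build a fresh S3FileSystem.
    S3FileSystem.clear_instance_cache()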