From 29d27b9aedbc977e21ce05ca4cfb03c963f425b8 Mon Sep 17 00:00:00 2001
From: Philipp Kessling <p.kessling@leibniz-hbi.de>
Date: Tue, 26 Nov 2024 09:21:09 +0100
Subject: [PATCH 1/3] feat: add some unit tests.

---
 dabapush/Configuration/__init__.py         |  1 -
 tests/Writer/test_ndjson_writer.py         | 61 +++++++++++++++++
 tests/test_utils.py                        | 78 ++++++++++++++++++++++
 tests/tests/Configuration/__init__.py      |  0
 tests/tests/Configuration/test_Registry.py | 45 +++++++++++++
 tests/tests/__init__.py                    |  0
 6 files changed, 184 insertions(+), 1 deletion(-)
 create mode 100644 tests/Writer/test_ndjson_writer.py
 create mode 100644 tests/test_utils.py
 create mode 100644 tests/tests/Configuration/__init__.py
 create mode 100644 tests/tests/Configuration/test_Registry.py
 create mode 100644 tests/tests/__init__.py

diff --git a/dabapush/Configuration/__init__.py b/dabapush/Configuration/__init__.py
index aba4c6e..e69de29 100644
--- a/dabapush/Configuration/__init__.py
+++ b/dabapush/Configuration/__init__.py
@@ -1 +0,0 @@
-from .FileWriterConfiguration import FileWriterConfiguration
diff --git a/tests/Writer/test_ndjson_writer.py b/tests/Writer/test_ndjson_writer.py
new file mode 100644
index 0000000..eb673eb
--- /dev/null
+++ b/tests/Writer/test_ndjson_writer.py
@@ -0,0 +1,61 @@
+"""Test suite for the NDJSONWriter module."""
+
+import pytest
+
+from dabapush.Record import Record
+from dabapush.Writer.NDJSONWriter import NDJSONWriterConfiguration
+
+
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        (
+            [{"key1": "value1"}, {"key2": "value2"}],
+            '{"key1":"value1"}\n{"key2":"value2"}\n',
+        ),
+        ([{"key1": "value1", "key2": "value2"}], '{"key1":"value1","key2":"value2"}\n'),
+    ],
+)
+def test_write_ndjson(data, expected, tmp_path):
+    """Should write records to a file in NDJSON format."""
+    configuration = NDJSONWriterConfiguration(
+        name="test",
+        id="test",
+        chunk_size=1,
+        path=str(tmp_path),
+        name_template="test.ndjson",
+    )
+    file_path = tmp_path / "test.ndjson"
+    writer = configuration.get_instance()
+    writer.write((Record(_) for _ in data))
+
+    with file_path.open("rt", encoding="utf8") as f:
+        result = f.read()
+
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        ([{"key1": "value1"}, {"key2": "value2"}], 2),
+        ([{"key1": "value1", "key2": "value2"}], 1),
+    ],
+)
+def test_write_ndjson_line_count(data, expected, tmp_path):
+    """Should write exactly one line per record to the NDJSON file."""
+    configuration = NDJSONWriterConfiguration(
+        name="test",
+        id="test",
+        chunk_size=1,
+        path=str(tmp_path),
+        name_template="test.ndjson",
+    )
+    file_path = tmp_path / "test.ndjson"
+    writer = configuration.get_instance()
+    writer.write((Record(_) for _ in data))
+
+    with file_path.open("rt", encoding="utf8") as f:
+        result = f.readlines()
+
+    assert len(result) == expected
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..a72d245
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,78 @@
+"""Test suite for the utils module."""
+
+import pytest
+
+from dabapush.utils import flatten, safe_access, safe_write, unpack
+
+# pylint: disable=W0622
+
+
+@pytest.mark.parametrize(
+    "nested_dict, namespace, expected",
+    [
+        ({"a": {"b": "yuk"}}, None, {"a.b": "yuk"}),
+        (
+            {"a": {"b": "yuk", "c": [{"d": "meh"}]}},
+            None,
+            {"a.b": "yuk", "a.c": [{"d": "meh"}]},
+        ),
+        ({"a": {"b": "yuk"}}, "namespace", {"namespace.a.b": "yuk"}),
+    ],
+)
+def test_flatten(nested_dict, namespace, expected):
+    """Should flatten dicts correctly."""
+    assert flatten(nested_dict, namespace=namespace) == expected
+
+
+@pytest.mark.parametrize(
+    "nested_dict, path, expected",
+    [
+        ({"a": {"b": {"c": "value"}}}, ["a", "b", "c"], "value"),
+        ({"a": {"b": {"c": "value"}}}, ["a", "b", "d"], None),
+        ({"a": {"b": {"c": "value"}}}, ["a", "b"], {"c": "value"}),
+    ],
+)
+def test_safe_access(nested_dict, path, expected):
+    """Should safely access nested dicts."""
+    assert safe_access(nested_dict, path) == expected
+
+
+@pytest.mark.parametrize(
+    "nested_dict, path, key, value, expected",
+    [
+        (
+            {"a": {"b": {"c": "value"}}},
+            ["a", "b"],
+            "d",
+            "new_value",
+            {"a": {"b": {"c": "value", "d": "new_value"}}},
+        ),
+        (
+            {"a": {"b": {"c": "value"}}},
+            ["a", "b", "e"],
+            "f",
+            "another_value",
+            {"a": {"b": {"c": "value", "e": {"f": "another_value"}}}},
+        ),
+    ],
+)
+def test_safe_write(nested_dict, path, key, value, expected):
+    """Should safely write to nested dicts."""
+    assert safe_write(nested_dict, path, key, value) == expected
+
+
+@pytest.mark.parametrize(
+    "includes, id, id_key, expected",
+    [
+        (
+            [{"id": "1", "name": "item1"}, {"id": "2", "name": "item2"}],
+            "1",
+            "id",
+            {"id": "1", "name": "item1"},
+        ),
+        ([{"id": "1", "name": "item1"}, {"id": "2", "name": "item2"}], "3", "id", None),
+    ],
+)
+def test_unpack(includes, id, id_key, expected):
+    """Should unpack a dict from a list of dicts."""
+    assert unpack(id, includes, id_key) == expected
diff --git a/tests/tests/Configuration/__init__.py b/tests/tests/Configuration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/tests/Configuration/test_Registry.py b/tests/tests/Configuration/test_Registry.py
new file mode 100644
index 0000000..98d10d0
--- /dev/null
+++ b/tests/tests/Configuration/test_Registry.py
@@ -0,0 +1,45 @@
+"""Test suite for the Registry module."""
+
+from importlib.metadata import EntryPoint
+
+from dabapush.Configuration import Registry
+from dabapush.Configuration.ReaderConfiguration import ReaderConfiguration
+from dabapush.Configuration.WriterConfiguration import WriterConfiguration
+
+
+def test_readers():
+    """Should fetch readers from the reader entry point."""
+    assert isinstance(Registry.readers(), list)
+    assert all(isinstance(_, EntryPoint) for _ in Registry.readers())
+
+
+def test_writers():
+    """Should fetch writers from the writer entry point."""
+    assert isinstance(Registry.writers(), list)
+    assert all(isinstance(_, EntryPoint) for _ in Registry.writers())
+
+
+def test_get_reader():
+    """Should fetch a reader by name."""
+    reader = Registry.get_reader("NDJSON")
+    assert reader is not None
+    assert issubclass(reader, ReaderConfiguration)
+
+
+def test_get_writer():
+    """Should fetch a writer by name."""
+    writer = Registry.get_writer("NDJSON")
+    assert writer is not None
+    assert issubclass(writer, WriterConfiguration)
+
+
+def test_list_all_readers():
+    """Should list all available readers."""
+    assert isinstance(Registry.list_all_readers(), list)
+    assert all(isinstance(_, str) for _ in Registry.list_all_readers())
+
+
+def test_list_all_writers():
+    """Should list all available writers."""
+    assert isinstance(Registry.list_all_writers(), list)
+    assert all(isinstance(_, str) for _ in Registry.list_all_writers())
diff --git a/tests/tests/__init__.py b/tests/tests/__init__.py
new file mode 100644
index 0000000..e69de29

From c167f6c208b80ba520b28441ea6adb7dff03510a Mon Sep 17 00:00:00 2001
From: Philipp Kessling <p.kessling@leibniz-hbi.de>
Date: Tue, 26 Nov 2024 09:53:33 +0100
Subject: [PATCH 2/3] feat: add unit tests for CSVWriter .

---
 .../Configuration/FileWriterConfiguration.py | 31 +++++---
 dabapush/Writer/CSVWriter.py                 | 17 ++---
 tests/Writer/test_csv_writer.py              | 71 +++++++++++++++++++
 3 files changed, 99 insertions(+), 20 deletions(-)
 create mode 100644 tests/Writer/test_csv_writer.py

diff --git a/dabapush/Configuration/FileWriterConfiguration.py b/dabapush/Configuration/FileWriterConfiguration.py
index 157f93a..ed8e3cf 100644
--- a/dabapush/Configuration/FileWriterConfiguration.py
+++ b/dabapush/Configuration/FileWriterConfiguration.py
@@ -1,10 +1,16 @@
 """FileWriterConfiguration provides a base class for file-based Writers."""
+
+import abc
 from datetime import datetime
 from string import Template
 from typing import Dict, Optional
 
+from loguru import logger as log
+
 from .WriterConfiguration import WriterConfiguration
 
+# pylint: disable=W0221,W0622,R0917,R0913
+
 
 class FileWriterConfiguration(WriterConfiguration):
     """Abstract class describing configuration items for a file based writer"""
@@ -33,15 +39,18 @@ def make_file_name(self, additional_keys: Optional[Dict] = None) -> str:
             Interpolated file name as str.
         """
         now = datetime.now()
-        return Template(self.name_template).substitute(
-            **{
-                "date": datetime.strftime(now, "%Y-%m-%d"),
-                "time": datetime.strftime(now, "%H%M"),
-                "name": self.name,
-                "id": self.id,
-                **(additional_keys or {}),
-            }
-        )
+        available_data = {
+            "date": datetime.strftime(now, "%Y-%m-%d"),
+            "time": datetime.strftime(now, "%H%M"),
+            "chunk_size": self.chunk_size,
+            "name": self.name,
+            "id": self.id,
+            **(additional_keys or {}),
+        }
+
+        log.info(f"Available data: {available_data}")
+
+        return Template(self.name_template).substitute(**available_data)
 
     def set_name_template(self, template: str):
         """Sets the template string.
@@ -51,3 +60,7 @@ def set_name_template(self, template: str):
             Template string to use.
         """
         self.name_template = template
+
+    @abc.abstractmethod
+    def get_instance(self) -> object or None:
+        """Get configured instance of Writer"""
diff --git a/dabapush/Writer/CSVWriter.py b/dabapush/Writer/CSVWriter.py
index c9400e0..6fa8e59 100644
--- a/dabapush/Writer/CSVWriter.py
+++ b/dabapush/Writer/CSVWriter.py
@@ -10,6 +10,8 @@
 from ..Configuration.FileWriterConfiguration import FileWriterConfiguration
 from .Writer import Writer
 
+# pylint: disable=R0917
+
 
 class CSVWriter(Writer):
     """Writes CSVs from buffered stream"""
@@ -23,11 +25,9 @@ def persist(self):
         """persist buffer to disk"""
 
         last_rows = self.buffer
-        self.buffer = []
-        log.info(f"Persisted {len(last_rows)} records")
 
         _path = Path(self.config.path) / self.config.make_file_name(
-            {"chunk_number": self.chunk_number}
+            {"chunk_number": self.chunk_number, "type": "csv"}
         )
         pd.DataFrame(
             (a.payload for a in last_rows),
@@ -35,6 +35,9 @@ def persist(self):
             r"\n|\r", r"\\n", regex=True
         ).to_csv(_path, index=False)
         self.chunk_number += 1
+        self.buffer = []
+
+        log.info(f"Persisted {len(last_rows)} records")
 
         return len(last_rows)
 
@@ -57,14 +60,6 @@ def __init__(  # pylint: disable=R0913
         )
         self.type = "csv"
 
-    @property
-    def file_path(self) -> Path:
-        """get the path to a file to write in"""
-        # evalutate self.name_template
-        file_name = self.make_file_name({"type": "csv"})
-        # append to self.path and return
-        return Path(self.path) / file_name
-
     def get_instance(self):  # pylint: disable=W0221
         """get configured instance of CSVWriter"""
         return CSVWriter(self)
diff --git a/tests/Writer/test_csv_writer.py b/tests/Writer/test_csv_writer.py
new file mode 100644
index 0000000..9581dd5
--- /dev/null
+++ b/tests/Writer/test_csv_writer.py
@@ -0,0 +1,71 @@
+"""Tests for CSVWriter class."""
+
+import pytest
+
+from dabapush.Record import Record
+from dabapush.Writer.CSVWriter import CSVWriterConfiguration
+
+# pylint: disable=W0621
+
+
+@pytest.fixture
+def config_factory():
+    """Return a factory for CSVWriterConfiguration.
+
+    Parameters
+    ----------
+    path : str
+        The path to write to.
+
+    Returns
+    -------
+    function
+        A factory function that returns a CSVWriterConfiguration.
+    """
+    yield lambda path: CSVWriterConfiguration(
+        name="test", chunk_size=1000000, path=str(path)
+    )
+
+
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        ([{"key1": "value1"}, {"key2": "value2"}], "key1,key2\nvalue1,\n,value2\n"),
+        ([{"key1": "value1", "key2": "value2"}], "key1,key2\nvalue1,value2\n"),
+    ],
+)
+def test_write_csv(data, expected, config_factory, tmp_path):
+    """Should write the correct data to the file."""
+    config = config_factory(path=tmp_path)
+    writer = config.get_instance()
+    writer.write((Record(payload=d) for d in data))
+    writer.persist()
+
+    files = tmp_path.glob("*.csv")
+
+    data = [file.read_text() for file in files]
+
+    assert data[0] == expected
+
+
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        ([{"key1": "value1"}, {"key2": "value2"}], 3),  # 1 header + 2 data rows
+        ([{"key1": "value1", "key2": "value2"}], 2),  # 1 header + 1 data row
+    ],
+)
+def test_write_csv_line_count(data, expected, config_factory, tmp_path):
+    """Should write the correct number of lines to the file."""
+    config = config_factory(path=tmp_path)
+    writer = config.get_instance()
+    writer.write((Record(payload=d) for d in data))
+    writer.persist()
+
+    files = tmp_path.glob("*.csv")
+
+    data = [file.read_text() for file in files]
+
+    print(data)
+
+    assert (len(data[0].split("\n")) - 1) == expected

From 3e80768107cb461326902542f5f1571302b85f50 Mon Sep 17 00:00:00 2001
From: Philipp Kessling <p.kessling@leibniz-hbi.de>
Date: Tue, 26 Nov 2024 09:54:17 +0100
Subject: [PATCH 3/3] fix: not an abstract classmethod.

---
 dabapush/Configuration/PlugInConfiguration.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dabapush/Configuration/PlugInConfiguration.py b/dabapush/Configuration/PlugInConfiguration.py
index 441266b..ec4706f 100644
--- a/dabapush/Configuration/PlugInConfiguration.py
+++ b/dabapush/Configuration/PlugInConfiguration.py
@@ -20,7 +20,6 @@ def __init__(self, name: str, id: str or None) -> None:
         self.name = name
         self.id = id if id is not None else str(uuid4())
 
-    @classmethod
     @abc.abstractmethod
-    def get_instance(cls) -> object or None:
+    def get_instance(self) -> object or None:
         """Get a configured instance of the appropriate reader or writer plugin."""