Merge branch 'develop' of github.com:quantumblacklabs/private-kedro into merge-master-to-develop
Jiri Klein committed Jun 17, 2021
2 parents d5c598b + e1251eb commit 72203bd
Showing 73 changed files with 1,292 additions and 1,594 deletions.
128 changes: 76 additions & 52 deletions .circleci/config.yml
@@ -4,15 +4,15 @@ orbs:
   win: circleci/[email protected]
 
 executors:
-  py36:
-    docker:
-      - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.6
   py37:
     docker:
       - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.7
   py38:
     docker:
       - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.8
+  py39:
+    docker:
+      - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.9
 
 commands:
   setup_conda:
@@ -24,6 +24,11 @@ commands:
       - run:
           name: Activate conda environment
           command: echo "conda deactivate; conda activate kedro_builder" >> $BASH_ENV
+      - run:
+          # pytables does not handle our HDFDataSet properly under Python 3.9 when
+          # pip-installed, so we install this dependency via conda instead
+          name: Install conda packages
+          command: echo "conda install -c conda-forge pytables -y" >> $BASH_ENV
 
   setup_requirements:
     description: Install PIP dependencies
@@ -184,6 +189,11 @@ commands:
   win_setup_requirements:
     description: Install Kedro dependencies
     steps:
+      # pytables and Fiona have a series of binary dependencies under Windows that
+      # are best handled by conda-installing instead of pip-installing them.
+      - run:
+          name: Install pytables
+          command: conda activate kedro_builder; conda install -c conda-forge pytables -y
       - run:
           name: Install Fiona
           command: conda activate kedro_builder; conda install -c conda-forge fiona -y
@@ -266,22 +276,6 @@ commands:
           command: conda activate kedro_builder; make pip-compile
 
 jobs:
-  unit_tests_36:
-    executor: py36
-    steps: [unit_tests]
-
-  linters_36:
-    executor: py36
-    steps: [lint]
-
-  e2e_tests_36:
-    executor: py36
-    steps: [e2e_tests]
-
-  docs_36:
-    executor: py36
-    steps: [build_docs]
-
   unit_tests_37:
     executor: py37
     steps: [unit_tests]
@@ -320,9 +314,23 @@ jobs:
     executor: py38
     steps: [e2e_tests]
 
-  pip_compile_36:
-    executor: py36
-    steps: [pip_compile]
+  unit_tests_39:
+    executor: py39
+    steps:
+      - checkout
+      - setup_conda
+      - setup_requirements
+      - run:
+          name: Run unit tests without Spark
+          command: make test-no-spark
+
+  linters_39:
+    executor: py39
+    steps: [lint]
+
+  e2e_tests_39:
+    executor: py39
+    steps: [e2e_tests]
 
   pip_compile_37:
     executor: py37
@@ -332,6 +340,10 @@
     executor: py38
     steps: [pip_compile]
 
+  pip_compile_39:
+    executor: py39
+    steps: [pip_compile]
+
   all_circleci_checks_succeeded:
     docker:
       - image: circleci/python # any light-weight image
@@ -342,14 +354,6 @@ jobs:
           command: echo "All checks passed"
 
   # Windows-related jobs
-  win_unit_tests_36:
-    executor:
-      name: win/default
-    steps:
-      - win_setup_conda:
-          python_version: "3.6"
-      - win_unit_tests
-
   win_unit_tests_37:
     executor:
       name: win/default
@@ -380,13 +384,27 @@ jobs:
             pytest .\tests --ignore .\tests\extras\datasets\spark --ignore .\tests\extras\datasets\tensorflow --ignore tests\framework\session\test_session_hooks.py --ignore .\tests\runner\test_parallel_runner.py --no-cov
             if ($?) { pytest .\tests\runner\test_parallel_runner.py --no-cov }
 
-  win_e2e_tests_36:
+  win_unit_tests_39:
     executor:
       name: win/default
     steps:
       - win_setup_conda:
-          python_version: "3.6"
-      - win_e2e_tests
+          python_version: "3.9"
+      - checkout
+      - win_setup_env
+      - restore_cache:
+          key: kedro-deps-v1-win-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }}
+      - win_setup_requirements
+      - run:
+          name: Set HDF5_DISABLE_VERSION_CHECK environment variable
+          command: setx /m HDF5_DISABLE_VERSION_CHECK 1
+      - run:
+          name: Run unit tests without Spark and TensorFlow
+          # Run `test_parallel_runner.py` separately because of `Windows fatal exception: stack overflow`
+          command: |
+            conda activate kedro_builder
+            pytest .\tests --ignore .\tests\extras\datasets\spark --ignore .\tests\extras\datasets\tensorflow --ignore tests\framework\session\test_session_hooks.py --ignore .\tests\runner\test_parallel_runner.py --no-cov
+            if ($?) { pytest .\tests\runner\test_parallel_runner.py --no-cov }
 
   win_e2e_tests_37:
     executor:
@@ -404,13 +422,13 @@
           python_version: "3.8"
       - win_e2e_tests
 
-  win_pip_compile_36:
+  win_e2e_tests_39:
     executor:
       name: win/default
     steps:
       - win_setup_conda:
-          python_version: "3.6"
-      - win_pip_compile
+          python_version: "3.9"
+      - win_e2e_tests
 
   win_pip_compile_37:
     executor:
@@ -430,14 +448,18 @@
           python_version: "3.8"
       - win_pip_compile
 
+  win_pip_compile_39:
+    executor:
+      name: win/default
+    steps:
+      - win_setup_conda:
+          python_version: "3.9"
+      - win_pip_compile
+
 workflows:
   version: 2
   regular:
     jobs:
-      - unit_tests_36
-      - linters_36
-      - e2e_tests_36
-      - docs_36
       - docs_linkcheck_37
       - unit_tests_37
       - linters_37
@@ -446,24 +468,23 @@ workflows:
       - unit_tests_38
       - linters_38
       - e2e_tests_38
-      - pip_compile_36
+      - unit_tests_39
+      - linters_39
+      - e2e_tests_39
       - pip_compile_37
       - pip_compile_38
-      - win_unit_tests_36
+      - pip_compile_39
       - win_unit_tests_37
       - win_unit_tests_38
-      - win_pip_compile_36
+      - win_unit_tests_39
       - win_pip_compile_37
       - win_pip_compile_38
-      - win_e2e_tests_36
+      - win_pip_compile_39
       - win_e2e_tests_37
       - win_e2e_tests_38
+      - win_e2e_tests_39
       - all_circleci_checks_succeeded:
           requires:
-            - unit_tests_36
-            - linters_36
-            - e2e_tests_36
-            - docs_36
             - unit_tests_37
             - linters_37
             - e2e_tests_37
@@ -472,16 +493,19 @@
             - unit_tests_38
             - linters_38
            - e2e_tests_38
-            - pip_compile_36
+            - unit_tests_39
+            - linters_39
+            - e2e_tests_39
             - pip_compile_37
             - pip_compile_38
-            - win_pip_compile_36
+            - pip_compile_39
             - win_pip_compile_37
             - win_pip_compile_38
-            - win_unit_tests_36
+            - win_pip_compile_39
             - win_unit_tests_37
-            - win_e2e_tests_36
             # Skipped due to Windows fatal exception: stack overflow
             # - win_unit_tests_38
+            # - win_unit_tests_39
             - win_e2e_tests_37
             - win_e2e_tests_38
+            - win_e2e_tests_39
3 changes: 3 additions & 0 deletions .gitignore
@@ -146,6 +146,9 @@ kedro.db
 .*.swo
 .*.swp
 
+# Prettier
+.prettierignore
+
 .pytest_cache/
 kedro/html
 docs/tmp-build-artifacts
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
         exclude: "^kedro/templates/|^features/steps/test_starter/"
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.720
+    rev: v0.812
     hooks:
       - id: mypy
         args: [--allow-redefinition, --ignore-missing-imports]
2 changes: 1 addition & 1 deletion .readthedocs.yml
@@ -21,7 +21,7 @@ formats:
 
 # Optionally set the version of Python and requirements required to build your docs
 python:
-  version: 3.6
+  version: 3.7
   install:
     - method: pip
       path: .
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -88,7 +88,7 @@ make build-docs
 * We use [Anaconda](https://www.anaconda.com/distribution/) as a preferred virtual environment
 * We use [SemVer](https://semver.org/) for versioning
 
-Our code is designed to be compatible with Python 3.6 onwards and our style guidelines are (in cascading order):
+Our code is designed to be compatible with Python 3.7 onwards and our style guidelines are (in cascading order):
 
 * [PEP 8 conventions](https://www.python.org/dev/peps/pep-0008/) for all Python code
 * [Google docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) for code comments
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
 ![Kedro Logo Banner](https://raw.githubusercontent.com/quantumblacklabs/kedro/develop/static/img/kedro_banner.png)
 
-[![Python version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue.svg)](https://pypi.org/project/kedro/)
+[![Python version](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue.svg)](https://pypi.org/project/kedro/)
 [![PyPI version](https://badge.fury.io/py/kedro.svg)](https://pypi.org/project/kedro/)
 [![Conda version](https://img.shields.io/conda/vn/conda-forge/kedro.svg)](https://anaconda.org/conda-forge/kedro)
 [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/quantumblacklabs/kedro/blob/master/LICENSE.md)
34 changes: 33 additions & 1 deletion RELEASE.md
@@ -1,3 +1,35 @@
+# Upcoming Release 0.18.0
+
+## Major features and improvements
+* Added support for Python 3.9; dropped support for Python 3.6.
+* Support specifying a parameters mapping in `pipeline()` without the `params:` prefix.
+* Added a new API, `Pipeline.filter()` (previously in `KedroContext._filter_pipeline()`), to filter parts of a pipeline.
+* Added `partitionBy` support and exposed `save_args` for `SparkHiveDataSet`.
+* Exposed `open_args_save` in `fs_args` for `pandas.ParquetDataSet`.
+
+## Breaking changes to the API
+* Added namespace to parameters in a modular pipeline, which addresses [Issue 399](https://github.com/quantumblacklabs/kedro/issues/399).
+* `pandas.ExcelDataSet` now uses the `openpyxl` engine instead of `xlrd`.
+* `pandas.ParquetDataSet` now calls `pd.to_parquet()` upon saving. Note that the argument `partition_cols` is not supported.
+* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline contains no nodes. The same `ValueError` is raised when there are no matching tags.
+* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline name doesn't exist in the pipeline registry.
+* Removed the deprecated functions `load_context` and `get_project_context`.
+* The `spark.SparkHiveDataSet` API has been updated to reflect `spark.SparkDataSet`. The `write_mode=insert` option has also been replaced with `write_mode=append`, as per the Spark style guide. This change addresses [Issue 725](https://github.com/quantumblacklabs/kedro/issues/725) and [Issue 745](https://github.com/quantumblacklabs/kedro/issues/745). Additionally, `upsert` mode now leverages `checkpoint` functionality and requires a valid `checkpointDir` to be set for the current `SparkContext`.
+* Deprecated and removed the `ProjectHooks.register_config_loader` `hook_spec` in favour of loading `CONFIG_LOADER_CLASS` directly from `settings.py`. The default option for `CONFIG_LOADER_CLASS` is now set to `kedro.config.ConfigLoader`.
+* Added `CONFIG_LOADER_ARGS` to `settings.py` to facilitate the provision of additional keyword arguments to the constructor of the project `config_loader`. The default option for `CONFIG_LOADER_ARGS` is an empty dictionary.
+* `yaml.YAMLDataSet` can no longer save a `pandas.DataFrame` directly, but it can save a dictionary. Use `pandas.DataFrame.to_dict()` to convert your `pandas.DataFrame` to a dictionary before you attempt to save it to YAML.
+* Deprecated the `--version` CLI option of the `kedro pipeline package` command. A specific pipeline package version can instead be set via the `__version__` variable in the pipeline package's `__init__.py` file.
+
+## Migration guide from Kedro 0.17.* to 0.18.*
+* Please remove any existing `hook_impl` of the `register_config_loader` method from `ProjectHooks` (or custom alternatives).
+* Populate `settings.py` with `CONFIG_LOADER_CLASS` set to your expected config loader class (for example `kedro.config.TemplatedConfigLoader` or a custom implementation). If `CONFIG_LOADER_CLASS` is not set, it will default to `kedro.config.ConfigLoader` at runtime.
+* Populate `settings.py` with `CONFIG_LOADER_ARGS` set to a dictionary of the expected keyword arguments. If `CONFIG_LOADER_ARGS` is not set, it will default to an empty dictionary.
+* Optional: You can now remove the `params:` prefix when supplying values to the `parameters` argument in a `pipeline()` call.
+* If you're using `pandas.ExcelDataSet`, make sure you have `openpyxl` installed in your environment. Note that this is pulled in automatically if you specify `kedro[pandas.ExcelDataSet]==0.18.0` in your `requirements.in`. You can uninstall `xlrd` if you were only using it for this dataset.
+* If you're using `pandas.ParquetDataSet`, please pass pandas saving arguments directly to `save_args` instead of nesting them in `from_pandas` (e.g. `save_args = {"preserve_index": False}` instead of `save_args = {"from_pandas": {"preserve_index": False}}`).
+* If you're using `spark.SparkHiveDataSet` with the `write_mode` option set to `insert`, please update this to `append`, in line with the Spark style guide. If you're using `spark.SparkHiveDataSet` with the `write_mode` option set to `upsert`, please make sure that your `SparkContext` has a valid `checkpointDir` set, either via the `SparkContext.setCheckpointDir` method or directly in the `conf` folder.
+* Edit any scripts containing `kedro pipeline package --version` to remove the `--version` option. If you wish to set a specific pipeline package version, set the `__version__` variable in the pipeline package's `__init__.py` file.
+
 # Release 0.17.4
 
 ## Major features and improvements
@@ -16,7 +48,7 @@
 * `kedro info` now outputs if a plugin has any `hooks` or `cli_hooks` implemented.
 * `PartitionedDataSet` now supports lazily materializing data on save.
 * `kedro pipeline describe` now defaults to the `__default__` pipeline when no pipeline name is provided and also shows the namespace the nodes belong to.
-* Fixed an issue where spark.SparkDataSet with enabled versioning would throw a VersionNotFoundError when using databricks-connect from a remote machine and saving to dbfs filesystem.
+* Fixed an issue where `spark.SparkDataSet` with enabled versioning would throw a `VersionNotFoundError` when using `databricks-connect` from a remote machine and saving to the `dbfs` filesystem.
 * `EmailMessageDataSet` added to doctree.
 * When node inputs do not pass validation, the error message is now shown as the most recent exception in the traceback ([Issue #761](https://github.com/quantumblacklabs/kedro/issues/761)).
 * `kedro pipeline package` now only packages the parameter file that exactly matches the pipeline name specified and the parameter files in a directory with the pipeline name.
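To make the release notes above concrete, a few short sketches follow. First, the new `pipeline()` parameters mapping and the `Pipeline.filter()` API. All node, dataset and parameter names below are illustrative, and the exact `filter()` keyword arguments are an assumption based on the release notes rather than a confirmed signature.

```python
# Sketch of the 0.18.0 parameters mapping and Pipeline.filter() API.
# Every name here (train_model, alpha, input_data, candidate) is made up.
from kedro.pipeline import Pipeline, node, pipeline


def train_model(alpha, data):
    """Placeholder node function."""
    return {"alpha": alpha, "n_rows": len(data)}


base = Pipeline(
    [node(train_model, ["params:alpha", "input_data"], "model", name="train_model")]
)

# Before 0.18.0 the mapping needed the params: prefix on both sides:
#   pipeline(base, parameters={"params:alpha": "params:alpha_variant"})
# From 0.18.0 the prefix can be dropped:
variant = pipeline(base, parameters={"alpha": "alpha_variant"}, namespace="candidate")

# Pipeline.filter() (previously KedroContext._filter_pipeline) selects a sub-pipeline:
subset = variant.filter(node_names=["candidate.train_model"])
```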
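The `CONFIG_LOADER_CLASS`/`CONFIG_LOADER_ARGS` migration amounts to a couple of lines in `settings.py`. A sketch, assuming you opt for `TemplatedConfigLoader`; both values are example choices, not requirements:

```python
# src/<package_name>/settings.py -- illustrative 0.18.0-style settings.
from kedro.config import TemplatedConfigLoader

# Previously registered via the ProjectHooks.register_config_loader hook_impl.
CONFIG_LOADER_CLASS = TemplatedConfigLoader  # defaults to kedro.config.ConfigLoader if unset
CONFIG_LOADER_ARGS = {"globals_pattern": "*globals.yml"}  # defaults to {} if unset
```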
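The `pandas.ParquetDataSet` and `yaml.YAMLDataSet` points can be illustrated the same way; file paths are made up, and the `save_args` example mirrors the one in the migration guide:

```python
# Illustrative dataset migration sketches for 0.18.0.
import pandas as pd

from kedro.extras.datasets.pandas import ParquetDataSet
from kedro.extras.datasets.yaml import YAMLDataSet

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# pandas.ParquetDataSet: pandas saving arguments now go directly into save_args,
# no longer nested under "from_pandas" as in 0.17.x.
parquet_ds = ParquetDataSet(
    filepath="df.parquet",
    save_args={"preserve_index": False},  # was {"from_pandas": {"preserve_index": False}}
)

# yaml.YAMLDataSet: a DataFrame can no longer be saved directly -- convert it first.
yaml_ds = YAMLDataSet(filepath="df.yaml")
yaml_ds.save(df.to_dict())
```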
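Similarly for `spark.SparkHiveDataSet`; the database, table and primary-key names are illustrative, and `table_pk` as the upsert key argument is an assumption based on the pre-existing API rather than something stated in the notes:

```python
# Illustrative sketch of the spark.SparkHiveDataSet changes in 0.18.0.
from pyspark.sql import SparkSession

from kedro.extras.datasets.spark import SparkHiveDataSet

spark = SparkSession.builder.getOrCreate()

# write_mode="insert" is replaced by write_mode="append":
append_ds = SparkHiveDataSet(database="default", table="events", write_mode="append")

# upsert mode now relies on Spark checkpointing, so the current SparkContext
# needs a valid checkpoint directory before any save:
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")
upsert_ds = SparkHiveDataSet(
    database="default", table="events", write_mode="upsert", table_pk=["event_id"]
)
```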
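Finally, with `kedro pipeline package --version` deprecated, the version moves into the pipeline package itself; a sketch, assuming a hypothetical pipeline named `data_engineering`:

```python
# src/<package_name>/pipelines/data_engineering/__init__.py  (hypothetical pipeline)
__version__ = "0.1.0"  # replaces the deprecated `kedro pipeline package --version` option
```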
1 change: 1 addition & 0 deletions docs/conf.py
@@ -198,6 +198,7 @@
     "https://zenodo.org/badge/latestdoi/182067506",
     "https://eternallybored.org/misc/wget/",
     "https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas",
+    "https://github.com/quantumblacklabs/kedro-starters/tree/master/standalone-datacatalog" # temporary until 0.18
 ]
 
 # retry before render a link broken (fix for "too many requests")
2 changes: 1 addition & 1 deletion docs/source/01_introduction/01_introduction.md
@@ -21,6 +21,6 @@ We also recommend the [frequently asked questions](../12_faq/01_faq.md) and the
 We have designed the documentation and the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) for anyone new to Kedro. The more knowledge of Python you have, the easier you will find the learning curve.
 
 ```eval_rst
-.. note:: There are a number of excellent online resources for learning Python, but you should choose those that reference Python 3, as Kedro is built for Python 3.6+. There are many curated lists of online resources, such as the `official Python programming language website <https://www.python.org/>`_ and `this list of free programming books and tutorials <https://github.com/EbookFoundation/free-programming-books/blob/master/books/free-programming-books.md#python>`_.
+.. note:: There are a number of excellent online resources for learning Python, but you should choose those that reference Python 3, as Kedro is built for Python 3.7+. There are many curated lists of online resources, such as the `official Python programming language website <https://www.python.org/>`_ and `this list of free programming books and tutorials <https://github.com/EbookFoundation/free-programming-books/blob/master/books/free-programming-books.md#python>`_.
 ```
2 changes: 1 addition & 1 deletion docs/source/02_get_started/05_example_project.md
@@ -123,7 +123,7 @@ This is the data engineering node function within `src/get_started/pipelines/dat
 | Node            | Description                                                    | Node Function Name       |
 +=================+================================================================+==========================+
 | Split data      | Splits the example                                             | :code:`split_data`       |
-|                 | `Iris dataset <https://archive.ics.uci.edu/ml/datasets/iris>`_ |                          |
+|                 | `Iris dataset <https://www.kaggle.com/uciml/iris>`_            |                          |
 |                 | into train and test samples                                    |                          |
 +-----------------+----------------------------------------------------------------+--------------------------+
 ```
2 changes: 1 addition & 1 deletion docs/source/02_get_started/06_starters.md
@@ -47,7 +47,7 @@ kedro starter list
 The Kedro team maintains the following starters to bootstrap new Kedro projects:
 
 * [Alias `astro-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/astro-iris): The [Kedro Iris dataset example project](https://kedro.readthedocs.io/en/stable/02_get_started/05_example_project.html) with a minimal setup for deploying the pipeline on Airflow with [Astronomer](https://www.astronomer.io/).
-* [Alias `mini-kedro`](https://github.com/quantumblacklabs/kedro-starters/tree/master/mini-kedro): A minimum setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../05_data/01_data_catalog.md), which is a core component of Kedro. This starter is of use in the exploratory phase of a project. For more information, please read the [Mini-Kedro](../04_kedro_project_setup/04_mini_kedro.md) guide.
+* [Alias `standalone-datacatalog`](https://github.com/quantumblacklabs/kedro-starters/tree/master/standalone-datacatalog): A minimum setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../05_data/01_data_catalog.md), which is a core component of Kedro. This starter is of use in the exploratory phase of a project. For more information, read the guide to [standalone use of the `DataCatalog`](../02_get_started/07_standalone_use_of_datacatalog.md). This starter was formerly known as `mini-kedro`.
 * [Alias `pandas-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pandas-iris): The [Kedro Iris dataset example project](./05_example_project.md)
 * [Alias `pyspark-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pyspark-iris): An alternative Kedro Iris dataset example, using [PySpark](../11_tools_integration/01_pyspark.md)
 * [Alias `pyspark`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pyspark): The configuration and initialisation code for a [Kedro pipeline using PySpark](../11_tools_integration/01_pyspark.md)
[Diffs for the remaining changed files are not shown.]