Merge branch 'develop' of github.com:quantumblacklabs/private-kedro into merge-master-to-develop
Jiri Klein committed Jun 17, 2021
2 parents d5c598b + e1251eb commit 72203bd
Showing 73 changed files with 1,292 additions and 1,594 deletions.
128 changes: 76 additions & 52 deletions .circleci/config.yml
@@ -4,15 +4,15 @@ orbs:
   win: circleci/[email protected]
 
 executors:
-  py36:
-    docker:
-      - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.6
   py37:
     docker:
       - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.7
   py38:
     docker:
       - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.8
+  py39:
+    docker:
+      - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.9
 
 commands:
   setup_conda:
@@ -24,6 +24,11 @@ commands:
       - run:
           name: Activate conda environment
           command: echo "conda deactivate; conda activate kedro_builder" >> $BASH_ENV
+      - run:
+          # pytables does not handle our HDFDataSet properly under Python 3.9 when
+          # pip-installed, so we install this dependency via conda instead
+          name: Install conda packages
+          command: echo "conda install -c conda-forge pytables -y" >> $BASH_ENV
 
   setup_requirements:
     description: Install PIP dependencies
@@ -184,6 +189,11 @@ commands:
   win_setup_requirements:
     description: Install Kedro dependencies
     steps:
+      # pytables and Fiona have a series of binary dependencies under Windows that
+      # are best handled by conda-installing instead of pip-installing them.
+      - run:
+          name: Install pytables
+          command: conda activate kedro_builder; conda install -c conda-forge pytables -y
       - run:
           name: Install Fiona
           command: conda activate kedro_builder; conda install -c conda-forge fiona -y
@@ -266,22 +276,6 @@ commands:
           command: conda activate kedro_builder; make pip-compile
 
 jobs:
-  unit_tests_36:
-    executor: py36
-    steps: [unit_tests]
-
-  linters_36:
-    executor: py36
-    steps: [lint]
-
-  e2e_tests_36:
-    executor: py36
-    steps: [e2e_tests]
-
-  docs_36:
-    executor: py36
-    steps: [build_docs]
-
   unit_tests_37:
     executor: py37
     steps: [unit_tests]
@@ -320,9 +314,23 @@ jobs:
     executor: py38
     steps: [e2e_tests]
 
-  pip_compile_36:
-    executor: py36
-    steps: [pip_compile]
+  unit_tests_39:
+    executor: py39
+    steps:
+      - checkout
+      - setup_conda
+      - setup_requirements
+      - run:
+          name: Run unit tests without Spark
+          command: make test-no-spark
+
+  linters_39:
+    executor: py39
+    steps: [lint]
+
+  e2e_tests_39:
+    executor: py39
+    steps: [e2e_tests]
 
   pip_compile_37:
     executor: py37
@@ -332,6 +340,10 @@
     executor: py38
     steps: [pip_compile]
 
+  pip_compile_39:
+    executor: py39
+    steps: [pip_compile]
+
   all_circleci_checks_succeeded:
     docker:
       - image: circleci/python # any light-weight image
@@ -342,14 +354,6 @@ jobs:
           command: echo "All checks passed"
 
   # Windows-related jobs
-  win_unit_tests_36:
-    executor:
-      name: win/default
-    steps:
-      - win_setup_conda:
-          python_version: "3.6"
-      - win_unit_tests
-
   win_unit_tests_37:
     executor:
       name: win/default
@@ -380,13 +384,27 @@ jobs:
             pytest .\tests --ignore .\tests\extras\datasets\spark --ignore .\tests\extras\datasets\tensorflow --ignore tests\framework\session\test_session_hooks.py --ignore .\tests\runner\test_parallel_runner.py --no-cov
             if ($?) { pytest .\tests\runner\test_parallel_runner.py --no-cov }
 
-  win_e2e_tests_36:
+  win_unit_tests_39:
     executor:
       name: win/default
     steps:
       - win_setup_conda:
-          python_version: "3.6"
-      - win_e2e_tests
+          python_version: "3.9"
+      - checkout
+      - win_setup_env
+      - restore_cache:
+          key: kedro-deps-v1-win-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }}
+      - win_setup_requirements
+      - run:
+          name: Set HDF5_DISABLE_VERSION_CHECK environment variable
+          command: setx /m HDF5_DISABLE_VERSION_CHECK 1
+      - run:
+          name: Run unit tests without Spark and TensorFlow
+          # Run `test_parallel_runner.py` separately because of `Windows fatal exception: stack overflow`
+          command: |
+            conda activate kedro_builder
+            pytest .\tests --ignore .\tests\extras\datasets\spark --ignore .\tests\extras\datasets\tensorflow --ignore tests\framework\session\test_session_hooks.py --ignore .\tests\runner\test_parallel_runner.py --no-cov
+            if ($?) { pytest .\tests\runner\test_parallel_runner.py --no-cov }
 
   win_e2e_tests_37:
     executor:
@@ -404,13 +422,13 @@
           python_version: "3.8"
       - win_e2e_tests
 
-  win_pip_compile_36:
+  win_e2e_tests_39:
     executor:
       name: win/default
     steps:
       - win_setup_conda:
-          python_version: "3.6"
-      - win_pip_compile
+          python_version: "3.9"
+      - win_e2e_tests
 
   win_pip_compile_37:
     executor:
@@ -430,14 +448,18 @@
           python_version: "3.8"
       - win_pip_compile
 
+  win_pip_compile_39:
+    executor:
+      name: win/default
+    steps:
+      - win_setup_conda:
+          python_version: "3.9"
+      - win_pip_compile
+
 workflows:
   version: 2
   regular:
     jobs:
-      - unit_tests_36
-      - linters_36
-      - e2e_tests_36
-      - docs_36
       - docs_linkcheck_37
       - unit_tests_37
       - linters_37
@@ -446,24 +468,23 @@ workflows:
       - unit_tests_38
       - linters_38
       - e2e_tests_38
-      - pip_compile_36
+      - unit_tests_39
+      - linters_39
+      - e2e_tests_39
       - pip_compile_37
       - pip_compile_38
-      - win_unit_tests_36
+      - pip_compile_39
       - win_unit_tests_37
       - win_unit_tests_38
-      - win_pip_compile_36
+      - win_unit_tests_39
       - win_pip_compile_37
       - win_pip_compile_38
-      - win_e2e_tests_36
+      - win_pip_compile_39
       - win_e2e_tests_37
       - win_e2e_tests_38
+      - win_e2e_tests_39
       - all_circleci_checks_succeeded:
           requires:
-            - unit_tests_36
-            - linters_36
-            - e2e_tests_36
-            - docs_36
             - unit_tests_37
             - linters_37
             - e2e_tests_37
@@ -472,16 +493,19 @@
             - unit_tests_38
             - linters_38
            - e2e_tests_38
-            - pip_compile_36
+            - unit_tests_39
+            - linters_39
+            - e2e_tests_39
             - pip_compile_37
             - pip_compile_38
-            - win_pip_compile_36
+            - pip_compile_39
             - win_pip_compile_37
             - win_pip_compile_38
-            - win_unit_tests_36
+            - win_pip_compile_39
             - win_unit_tests_37
-            - win_e2e_tests_36
             # Skipped due to Windows fatal exception: stack overflow
             # - win_unit_tests_38
+            # - win_unit_tests_39
             - win_e2e_tests_37
             - win_e2e_tests_38
+            - win_e2e_tests_39
3 changes: 3 additions & 0 deletions .gitignore
@@ -146,6 +146,9 @@ kedro.db
 .*.swo
 .*.swp
 
+# Prettier
+.prettierignore
+
 .pytest_cache/
 kedro/html
 docs/tmp-build-artifacts
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
         exclude: "^kedro/templates/|^features/steps/test_starter/"
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.720
+    rev: v0.812
     hooks:
       - id: mypy
         args: [--allow-redefinition, --ignore-missing-imports]
2 changes: 1 addition & 1 deletion .readthedocs.yml
@@ -21,7 +21,7 @@ formats:
 
 # Optionally set the version of Python and requirements required to build your docs
 python:
-  version: 3.6
+  version: 3.7
   install:
     - method: pip
       path: .
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -88,7 +88,7 @@ make build-docs
 * We use [Anaconda](https://www.anaconda.com/distribution/) as a preferred virtual environment
 * We use [SemVer](https://semver.org/) for versioning
 
-Our code is designed to be compatible with Python 3.6 onwards and our style guidelines are (in cascading order):
+Our code is designed to be compatible with Python 3.7 onwards and our style guidelines are (in cascading order):
 
 * [PEP 8 conventions](https://www.python.org/dev/peps/pep-0008/) for all Python code
 * [Google docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) for code comments
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
 ![Kedro Logo Banner](https://raw.githubusercontent.com/quantumblacklabs/kedro/develop/static/img/kedro_banner.png)
 
-[![Python version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue.svg)](https://pypi.org/project/kedro/)
+[![Python version](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue.svg)](https://pypi.org/project/kedro/)
 [![PyPI version](https://badge.fury.io/py/kedro.svg)](https://pypi.org/project/kedro/)
 [![Conda version](https://img.shields.io/conda/vn/conda-forge/kedro.svg)](https://anaconda.org/conda-forge/kedro)
 [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/quantumblacklabs/kedro/blob/master/LICENSE.md)
34 changes: 33 additions & 1 deletion RELEASE.md
@@ -1,3 +1,35 @@
+# Upcoming Release 0.18.0
+
+## Major features and improvements
+* Added support for Python 3.9; dropped support for Python 3.6.
+* Support specifying a parameters mapping in `pipeline()` without the `params:` prefix.
+* Added a new API, `Pipeline.filter()` (previously in `KedroContext._filter_pipeline()`), to filter parts of a pipeline.
+* Added `partitionBy` support and exposed `save_args` for `SparkHiveDataSet`.
+* Exposed `open_args_save` in `fs_args` for `pandas.ParquetDataSet`.
+
+## Breaking changes to the API
+* Added namespace to parameters in a modular pipeline, which addresses [Issue 399](https://github.com/quantumblacklabs/kedro/issues/399).
+* `pandas.ExcelDataSet` now uses the `openpyxl` engine instead of `xlrd`.
+* `pandas.ParquetDataSet` now calls `pd.to_parquet()` upon saving. Note that the argument `partition_cols` is not supported.
+* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline contains no nodes. The same `ValueError` is raised when there are no matching tags.
+* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline name doesn't exist in the pipeline registry.
+* Removed the deprecated functions `load_context` and `get_project_context`.
+* The `spark.SparkHiveDataSet` API has been updated to reflect `spark.SparkDataSet`. The `write_mode=insert` option has also been replaced with `write_mode=append`, as per the Spark style guide. This change addresses [Issue 725](https://github.com/quantumblacklabs/kedro/issues/725) and [Issue 745](https://github.com/quantumblacklabs/kedro/issues/745). Additionally, `upsert` mode now leverages `checkpoint` functionality and requires a valid `checkpointDir` to be set for the current `SparkContext`.
+* Deprecated and removed the `ProjectHooks.register_config_loader` `hook_spec` in favour of loading `CONFIG_LOADER_CLASS` directly from `settings.py`. The default option for `CONFIG_LOADER_CLASS` is now set to `kedro.config.ConfigLoader`.
+* Added `CONFIG_LOADER_ARGS` to `settings.py` to facilitate the provision of additional keyword arguments to the constructor of the project `config_loader`. The default option for `CONFIG_LOADER_ARGS` is an empty dictionary.
+* `yaml.YAMLDataSet` can no longer save a `pandas.DataFrame` directly, but it can save a dictionary. Use `pandas.DataFrame.to_dict()` to convert your `pandas.DataFrame` to a dictionary before you attempt to save it to YAML.
+* Deprecated the `--version` CLI option of the `kedro pipeline package` command. A specific pipeline package version can instead be set via the `__version__` variable in the pipeline package's `__init__.py` file.
+
+## Migration guide from Kedro 0.17.* to 0.18.*
+* Please remove any existing `hook_impl` of the `register_config_loader` method from `ProjectHooks` (or custom alternatives).
+* Populate `settings.py` with `CONFIG_LOADER_CLASS` set to your expected config loader class (for example `kedro.config.TemplatedConfigLoader` or a custom implementation). If `CONFIG_LOADER_CLASS` is not set, it will default to `kedro.config.ConfigLoader` at runtime.
+* Populate `settings.py` with `CONFIG_LOADER_ARGS` set to a dictionary of the expected keyword arguments. If `CONFIG_LOADER_ARGS` is not set, it will default to an empty dictionary.
+* Optional: You can now remove the `params:` prefix when supplying values to the `parameters` argument in a `pipeline()` call.
+* If you're using `pandas.ExcelDataSet`, make sure you have `openpyxl` installed in your environment. Note that this is pulled in automatically if you specify `kedro[pandas.ExcelDataSet]==0.18.0` in your `requirements.in`. You can uninstall `xlrd` if you were only using it for this dataset.
+* If you're using `pandas.ParquetDataSet`, please pass pandas saving arguments directly to `save_args` instead of nesting them in `from_pandas` (e.g. `save_args = {"preserve_index": False}` instead of `save_args = {"from_pandas": {"preserve_index": False}}`).
+* If you're using `spark.SparkHiveDataSet` with the `write_mode` option set to `insert`, please update this to `append`, in line with the Spark style guide. If you're using `spark.SparkHiveDataSet` with the `write_mode` option set to `upsert`, please make sure that your `SparkContext` has a valid `checkpointDir` set, either via the `SparkContext.setCheckpointDir` method or directly in the `conf` folder.
+* Edit any scripts containing `kedro pipeline package --version` to remove the `--version` option. If you wish to set a specific pipeline package version, set the `__version__` variable in the pipeline package's `__init__.py` file.
+
 # Release 0.17.4
 
 ## Major features and improvements
@@ -16,7 +48,7 @@
 * `kedro info` now outputs if a plugin has any `hooks` or `cli_hooks` implemented.
 * `PartitionedDataSet` now supports lazily materializing data on save.
 * `kedro pipeline describe` now defaults to the `__default__` pipeline when no pipeline name is provided and also shows the namespace the nodes belong to.
-* Fixed an issue where spark.SparkDataSet with enabled versioning would throw a VersionNotFoundError when using databricks-connect from a remote machine and saving to dbfs filesystem.
+* Fixed an issue where `spark.SparkDataSet` with enabled versioning would throw a `VersionNotFoundError` when using `databricks-connect` from a remote machine and saving to the `dbfs` filesystem.
 * `EmailMessageDataSet` added to doctree.
 * When node inputs do not pass validation, the error message is now shown as the most recent exception in the traceback ([Issue #761](https://github.com/quantumblacklabs/kedro/issues/761)).
 * `kedro pipeline package` now only packages the parameter file that exactly matches the pipeline name specified and the parameter files in a directory with the pipeline name.
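To make the release notes above concrete, a few short sketches follow. First, the new `pipeline()` parameters mapping and the `Pipeline.filter()` API. All node, dataset and parameter names below are illustrative, and the exact `filter()` keyword arguments are an assumption based on the release notes rather than a confirmed signature.

```python
# Sketch of the 0.18.0 parameters mapping and Pipeline.filter() API.
# Every name here (train_model, alpha, input_data, candidate) is made up.
from kedro.pipeline import Pipeline, node, pipeline


def train_model(alpha, data):
    """Placeholder node function."""
    return {"alpha": alpha, "n_rows": len(data)}


base = Pipeline(
    [node(train_model, ["params:alpha", "input_data"], "model", name="train_model")]
)

# Before 0.18.0 the mapping needed the params: prefix on both sides:
#   pipeline(base, parameters={"params:alpha": "params:alpha_variant"})
# From 0.18.0 the prefix can be dropped:
variant = pipeline(base, parameters={"alpha": "alpha_variant"}, namespace="candidate")

# Pipeline.filter() (previously KedroContext._filter_pipeline) selects a sub-pipeline:
subset = variant.filter(node_names=["candidate.train_model"])
```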
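The `CONFIG_LOADER_CLASS`/`CONFIG_LOADER_ARGS` migration amounts to a couple of lines in `settings.py`. A sketch, assuming you opt for `TemplatedConfigLoader`; both values are example choices, not requirements:

```python
# src/<package_name>/settings.py -- illustrative 0.18.0-style settings.
from kedro.config import TemplatedConfigLoader

# Previously registered via the ProjectHooks.register_config_loader hook_impl.
CONFIG_LOADER_CLASS = TemplatedConfigLoader  # defaults to kedro.config.ConfigLoader if unset
CONFIG_LOADER_ARGS = {"globals_pattern": "*globals.yml"}  # defaults to {} if unset
```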
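The `pandas.ParquetDataSet` and `yaml.YAMLDataSet` points can be illustrated the same way; file paths are made up, and the `save_args` example mirrors the one in the migration guide:

```python
# Illustrative dataset migration sketches for 0.18.0.
import pandas as pd

from kedro.extras.datasets.pandas import ParquetDataSet
from kedro.extras.datasets.yaml import YAMLDataSet

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# pandas.ParquetDataSet: pandas saving arguments now go directly into save_args,
# no longer nested under "from_pandas" as in 0.17.x.
parquet_ds = ParquetDataSet(
    filepath="df.parquet",
    save_args={"preserve_index": False},  # was {"from_pandas": {"preserve_index": False}}
)

# yaml.YAMLDataSet: a DataFrame can no longer be saved directly -- convert it first.
yaml_ds = YAMLDataSet(filepath="df.yaml")
yaml_ds.save(df.to_dict())
```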
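Similarly for `spark.SparkHiveDataSet`; the database, table and primary-key names are illustrative, and `table_pk` as the upsert key argument is an assumption based on the pre-existing API rather than something stated in the notes:

```python
# Illustrative sketch of the spark.SparkHiveDataSet changes in 0.18.0.
from pyspark.sql import SparkSession

from kedro.extras.datasets.spark import SparkHiveDataSet

spark = SparkSession.builder.getOrCreate()

# write_mode="insert" is replaced by write_mode="append":
append_ds = SparkHiveDataSet(database="default", table="events", write_mode="append")

# upsert mode now relies on Spark checkpointing, so the current SparkContext
# needs a valid checkpoint directory before any save:
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")
upsert_ds = SparkHiveDataSet(
    database="default", table="events", write_mode="upsert", table_pk=["event_id"]
)
```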
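Finally, with `kedro pipeline package --version` deprecated, the version moves into the pipeline package itself; a sketch, assuming a hypothetical pipeline named `data_engineering`:

```python
# src/<package_name>/pipelines/data_engineering/__init__.py  (hypothetical pipeline)
__version__ = "0.1.0"  # replaces the deprecated `kedro pipeline package --version` option
```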
1 change: 1 addition & 0 deletions docs/conf.py
@@ -198,6 +198,7 @@
     "https://zenodo.org/badge/latestdoi/182067506",
     "https://eternallybored.org/misc/wget/",
     "https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas",
+    "https://github.com/quantumblacklabs/kedro-starters/tree/master/standalone-datacatalog" # temporary until 0.18
 ]
 
 # retry before render a link broken (fix for "too many requests")
2 changes: 1 addition & 1 deletion docs/source/01_introduction/01_introduction.md
@@ -21,6 +21,6 @@ We also recommend the [frequently asked questions](../12_faq/01_faq.md) and the
 We have designed the documentation and the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) for anyone new to Kedro. The more knowledge of Python you have, the easier you will find the learning curve.
 
 ```eval_rst
-.. note:: There are a number of excellent online resources for learning Python, but you should choose those that reference Python 3, as Kedro is built for Python 3.6+. There are many curated lists of online resources, such as the `official Python programming language website <https://www.python.org/>`_ and `this list of free programming books and tutorials <https://github.com/EbookFoundation/free-programming-books/blob/master/books/free-programming-books.md#python>`_.
+.. note:: There are a number of excellent online resources for learning Python, but you should choose those that reference Python 3, as Kedro is built for Python 3.7+. There are many curated lists of online resources, such as the `official Python programming language website <https://www.python.org/>`_ and `this list of free programming books and tutorials <https://github.com/EbookFoundation/free-programming-books/blob/master/books/free-programming-books.md#python>`_.
 ```
2 changes: 1 addition & 1 deletion docs/source/02_get_started/05_example_project.md
@@ -123,7 +123,7 @@ This is the data engineering node function within `src/get_started/pipelines/dat
 | Node            | Description                                                    | Node Function Name       |
 +=================+================================================================+==========================+
 | Split data      | Splits the example                                             | :code:`split_data`       |
-|                 | `Iris dataset <https://archive.ics.uci.edu/ml/datasets/iris>`_ |                          |
+|                 | `Iris dataset <https://www.kaggle.com/uciml/iris>`_            |                          |
 |                 | into train and test samples                                    |                          |
 +-----------------+----------------------------------------------------------------+--------------------------+
 ```
2 changes: 1 addition & 1 deletion docs/source/02_get_started/06_starters.md
@@ -47,7 +47,7 @@ kedro starter list
 The Kedro team maintains the following starters to bootstrap new Kedro projects:
 
 * [Alias `astro-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/astro-iris): The [Kedro Iris dataset example project](https://kedro.readthedocs.io/en/stable/02_get_started/05_example_project.html) with a minimal setup for deploying the pipeline on Airflow with [Astronomer](https://www.astronomer.io/).
-* [Alias `mini-kedro`](https://github.com/quantumblacklabs/kedro-starters/tree/master/mini-kedro): A minimum setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../05_data/01_data_catalog.md), which is a core component of Kedro. This starter is of use in the exploratory phase of a project. For more information, please read the [Mini-Kedro](../04_kedro_project_setup/04_mini_kedro.md) guide.
+* [Alias `standalone-datacatalog`](https://github.com/quantumblacklabs/kedro-starters/tree/master/standalone-datacatalog): A minimum setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../05_data/01_data_catalog.md), which is a core component of Kedro. This starter is of use in the exploratory phase of a project. For more information, read the guide to [standalone use of the `DataCatalog`](../02_get_started/07_standalone_use_of_datacatalog.md). This starter was formerly known as `mini-kedro`.
 * [Alias `pandas-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pandas-iris): The [Kedro Iris dataset example project](./05_example_project.md)
 * [Alias `pyspark-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pyspark-iris): An alternative Kedro Iris dataset example, using [PySpark](../11_tools_integration/01_pyspark.md)
 * [Alias `pyspark`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pyspark): The configuration and initialisation code for a [Kedro pipeline using PySpark](../11_tools_integration/01_pyspark.md)
[Diffs for the remaining changed files are not shown.]