From 94eb3e02b1ff6f004651f812e5c37185736d63b3 Mon Sep 17 00:00:00 2001 From: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> Date: Sun, 1 Dec 2024 04:44:26 +0000 Subject: [PATCH 1/6] Move kedro-catalog JSON schema to kedro-datasets #4258 Signed-off-by: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> --- .../jsonschema/kedro-catalog-0.15.9.json | 1203 ++++++++++++++ .../static/jsonschema/kedro-catalog-0.16.json | 764 +++++++++ .../static/jsonschema/kedro-catalog-0.17.json | 951 +++++++++++ .../static/jsonschema/kedro-catalog-0.18.json | 1424 ++++++++++++++++ .../static/jsonschema/kedro-catalog-0.19.json | 1471 +++++++++++++++++ 5 files changed, 5813 insertions(+) create mode 100644 kedro-datasets/static/jsonschema/kedro-catalog-0.15.9.json create mode 100644 kedro-datasets/static/jsonschema/kedro-catalog-0.16.json create mode 100644 kedro-datasets/static/jsonschema/kedro-catalog-0.17.json create mode 100644 kedro-datasets/static/jsonschema/kedro-catalog-0.18.json create mode 100644 kedro-datasets/static/jsonschema/kedro-catalog-0.19.json diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.15.9.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.15.9.json new file mode 100644 index 000000000..18a25576b --- /dev/null +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.15.9.json @@ -0,0 +1,1203 @@ +{ + "type": "object", + "patternProperties": { + "^[a-z0-9-_]+$": { + "required": ["type"], + "properties": { + "type": { + "type": "string", + "enum": [ + "networkx.NetworkXDataSet", + "dask.ParquetDataSet", + "biosequence.BioSequenceDataSet", + "matplotlib.MatplotlibWriter", + "yaml.YAMLDataSet", + "pickle.PickleDataSet", + "text.TextDataSet", + "spark.SparkJDBCDataSet", + "spark.SparkHiveDataSet", + "spark.SparkDataSet", + "pandas.JSONBlobDataSet", + "pandas.JSONDataSet", + "pandas.SQLTableDataSet", + "pandas.SQLQueryDataSet", + "pandas.ParquetDataSet", + "pandas.FeatherDataSet", + "pandas.CSVBlobDataSet", + "pandas.HDFDataSet", 
+ "pandas.CSVDataSet", + "pandas.ExcelDataSet", + "pandas.GBQTableDataSet", + "PickleLocalDataSet", + "JSONLocalDataSet", + "HDFLocalDataSet", + "PartitionedDataSet", + "CachedDataSet", + "JSONDataSet", + "CSVHTTPDataSet", + "MemoryDataSet", + "CSVLocalDataSet", + "ExcelLocalDataSet", + "LambdaDataSet", + "HDFS3DataSet", + "PickleS3DataSet", + "SQLTableDataSet", + "SQLQueryDataSet", + "CSVS3DataSet", + "ParquetLocalDataSet", + "TextLocalDataSet" + ] + } + }, + "allOf": [ + { + "if": { + "properties": { "type": { "const": "networkx.NetworkXDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "The path to the NetworkX graph JSON file." + }, + "load_args": { + "type": "object", + "description": "Arguments passed on to ``networkx.node_link_graph``.\nSee the details in\nhttps://networkx.org/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html" + }, + "save_args": { + "type": "object", + "description": "Arguments passed on to ``networkx.node_link_data``.\nSee the details in\nhttps://networkx.org/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. 
for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`" + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "dask.ParquetDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to a parquet file,\nparquet collection or the directory of a multipart parquet." + }, + "load_args": { + "type": "object", + "description": "Additional loading options for `dask.dataframe.read_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet" + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `dask.dataframe.to_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Optional parameters to the backend file system driver:\nhttps://docs.dask.org/en/latest/remote-data-services.html#optional-parameters" + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { + "type": { "const": "biosequence.BioSequenceDataSet" } + } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "path to sequence file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." + }, + "load_args": { + "type": "object", + "description": "Options for parsing sequence files by Biopython ``SeqIO.parse()``." + }, + "save_args": { + "type": "object", + "description": "file format supported by Biopython ``SeqIO.write()``.\nE.g. `{\"format\": \"fasta\"}`." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`." 
+ }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "matplotlib.MatplotlibWriter" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Key path to a matplot object file(s) prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'client_kwargs': {'aws_access_key_id': '', 'aws_secret_access_key': ''}}`" + }, + "save_args": { + "type": "object", + "description": "Save args passed to `plt.savefig`. 
See\nhttps://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html" + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "yaml.YAMLDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a YAML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "PyYAML options for saving YAML files (arguments passed\ninto ``yaml.dump``). Here you can find all available arguments:\nhttps://pyyaml.org/wiki/PyYAMLDocumentation\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`." 
+ }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pickle.PickleDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a Pickle file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments:\nhttps://docs.python.org/3/library/pickle.html#pickle.loads\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments:\nhttps://docs.python.org/3/library/pickle.html#pickle.dumps\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`." 
+ }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "text.TextDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Load arguments should be specified in accordance with\nthe open function of the underlying filesystem. E.g. for local file\nhttps://docs.python.org/3/library/functions.html#open" + }, + "save_args": { + "type": "object", + "description": "Save arguments should be specified in accordance with\nthe open function of the underlying filesystem. E.g. for local file\nhttps://docs.python.org/3/library/functions.html#open" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`." 
+ }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "spark.SparkJDBCDataSet" } } + }, + "then": { + "required": ["url", "table"], + "properties": { + "url": { + "type": "string", + "description": "A JDBC URL of the form ``jdbc:subprotocol:subname``." + }, + "table": { + "type": "string", + "description": "The name of the table to load or save data to." + }, + "credentials": { + "type": "object", + "description": "A dictionary of JDBC database connection arguments.\nNormally at least properties ``user`` and ``password`` with\ntheir corresponding values. It updates ``properties``\nparameter in ``load_args`` and ``save_args`` in case it is\nprovided." + }, + "load_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameReader.jdbc.html" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html" + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "spark.SparkHiveDataSet" } } + }, + "then": { + "required": ["database", "table", "write_mode"], + "properties": { + "database": { + "type": "string", + "description": "The name of the hive database." 
+ }, + "table": { + "type": "string", + "description": "The name of the table within the database." + }, + "write_mode": { + "type": "string", + "description": "``insert``, ``upsert`` or ``overwrite`` are supported." + }, + "table_pk": { + "type": "array", + "description": "If performing an upsert, this identifies the primary key columns used to\nresolve preexisting data. Is required for ``write_mode=\"upsert\"``." + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "spark.SparkDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to a Spark dataframe. When using Databricks\nand working with data written to mount path points,\nspecify ``filepath``s for (versioned) ``SparkDataSet``s\nstarting with ``/dbfs/mnt``." + }, + "file_format": { + "type": "string", + "description": "File format used during load and save\noperations. Formats supported by the running\nSparkContext include parquet, csv. For a list of supported\nformats please refer to Apache Spark documentation at\nhttps://spark.apache.org/docs/latest/sql-programming-guide.html" + }, + "load_args": { + "type": "object", + "description": "Load args passed to Spark DataFrameReader load method.\nIt is dependent on the selected file format. You can find\na list of read options for each supported format\nin Spark DataFrame read documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "save_args": { + "type": "object", + "description": "Save args passed to Spark DataFrame write options.\nSimilar to load_args this is dependent on the selected file\nformat. 
You can pass ``mode`` and ``partitionBy`` to specify\nyour overwrite mode and partitioning respectively. You can find\na list of options for each format in Spark DataFrame\nwrite documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "credentials": { + "type": "object", + "description": "Credentials to access the S3 bucket, such as\n``aws_access_key_id``, ``aws_secret_access_key``, if ``filepath``\nprefix is ``s3a://`` or ``s3n://``. Optional keyword arguments passed to\n``hdfs.client.InsecureClient`` if ``filepath`` prefix is ``hdfs://``.\nIgnored otherwise." + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.JSONBlobDataSet" } } + }, + "then": { + "required": ["filepath", "container_name", "credentials"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to an Azure Blob of a JSON file." + }, + "container_name": { + "type": "string", + "description": "Azure container name." + }, + "credentials": { + "type": "object", + "description": "Credentials (``account_name`` and\n``account_key`` or ``sas_token``) to access the Azure Blob Storage." + }, + "encoding": { + "type": "string", + "description": "Default utf-8. Defines encoding of JSON files downloaded as binary streams." 
+ }, + "blob_from_bytes_args": { + "type": "object", + "description": "Any additional arguments to pass to Azure's\n``create_blob_from_bytes`` method:\nhttps://docs.microsoft.com/en-us/python/api/azure.storage.blob.blockblobservice.blockblobservice?view=azure-python#create-blob-from-bytes" + }, + "blob_to_bytes_args": { + "type": "object", + "description": "Any additional arguments to pass to Azure's\n``get_blob_to_bytes`` method:\nhttps://docs.microsoft.com/en-us/python/api/azure.storage.blob.baseblobservice.baseblobservice?view=azure-python#get-blob-to-bytes" + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.JSONDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{project: 'my-project', ...}`." + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.SQLTableDataSet" } } + }, + "then": { + "required": ["table_name", "credentials"], + "properties": { + "table_name": { + "type": "string", + "description": "The table name to load or save data to. It\noverwrites name in ``save_args`` and ``table_name``\nparameters in ``load_args``." + }, + "credentials": { + "type": "object", + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_table``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying pandas ``to_sql`` function along\nwith the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls\nIt has ``index=False`` in the default parameters." + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.SQLQueryDataSet" } } + }, + "then": { + "required": ["sql", "credentials"], + "properties": { + "sql": { + "type": "string", + "description": "The sql query statement." + }, + "credentials": { + "type": "object", + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_query``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.ParquetDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a Parquet file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Parquet files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_parquet.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `pyarrow.parquet.write_table`.\nHere you can find all available arguments:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`." + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.FeatherDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a feather file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading feather files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`." 
+ }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.CSVBlobDataSet" } } + }, + "then": { + "required": ["filepath", "container_name", "credentials"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to an Azure Blob of a CSV file." + }, + "container_name": { + "type": "string", + "description": "Azure container name." + }, + "credentials": { + "type": "object", + "description": "Credentials (``account_name`` and\n``account_key`` or ``sas_token``) to access the Azure Blob Storage." + }, + "blob_to_text_args": { + "type": "object", + "description": "Any additional arguments to pass to Azure's\n``get_blob_to_text`` method:\nhttps://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.baseblobservice.baseblobservice?view=azure-python#get-blob-to-text" + }, + "blob_from_text_args": { + "type": "object", + "description": "Any additional arguments to pass to Azure's\n``create_blob_from_text`` method:\nhttps://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blockblobservice.blockblobservice?view=azure-python#create-blob-from-text" + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." 
+ }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.HDFDataSet" } } }, + "then": { + "required": ["filepath", "key"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a hdf file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "key": { + "type": "string", + "description": "Identifier to the group in the HDF store." + }, + "load_args": { + "type": "object", + "description": "PyTables options for loading hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "PyTables options for saving hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. 
for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`" + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.CSVDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a CSV file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`." 
+ }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.ExcelDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to an Excel file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "engine": { + "type": "string", + "description": "The engine used to write to Excel files. The default\nengine is 'xlsxwriter'." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"xlrd\"." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under the \"writer\" key. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. 
for ``GCSFileSystem`` class: `{\"project\": \"my-project\", ...}`." + }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.GBQTableDataSet" } } + }, + "then": { + "required": ["dataset", "table_name"], + "properties": { + "dataset": { + "type": "string", + "description": "Google BigQuery dataset." + }, + "table_name": { + "type": "string", + "description": "Google BigQuery table name." + }, + "project": { + "type": "string", + "description": "Google BigQuery Account project ID.\nOptional when available from the environment.\nhttps://cloud.google.com/resource-manager/docs/creating-managing-projects" + }, + "credentials": { + "pattern": ".*", + "description": "Credentials for accessing Google APIs.\nEither ``google.auth.credentials.Credentials`` object or dictionary with\nparameters required to instantiate ``google.oauth2.credentials.Credentials``.\nHere you can find all the arguments:\nhttps://google-auth.readthedocs.io/en/latest/reference/google.oauth2.credentials.html" + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading BigQuery table into DataFrame.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving DataFrame to BigQuery table.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html\nAll defaults are preserved, but \"progress_bar\", which is set to False." 
+ }, + "layer": { + "type": "string", + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "PickleLocalDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "path to a pkl file." + }, + "backend": { + "type": "string", + "description": "backend to use, must be one of ['pickle', 'joblib']." + }, + "load_args": { + "type": "object", + "description": "Options for loading pickle files. Refer to the help\nfile of ``pickle.load`` or ``joblib.load`` for options." + }, + "save_args": { + "type": "object", + "description": "Options for saving pickle files. Refer to the help\nfile of ``pickle.dump`` or ``joblib.dump`` for options." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "JSONLocalDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "path to a local json file." + }, + "load_args": { + "type": "object", + "description": "Arguments passed on to ``json.load``.\nSee https://docs.python.org/3/library/json.html for details.\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Arguments passed on to ``json.dump``.\nSee https://docs.python.org/3/library/json.html\nfor details. All defaults are preserved." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "HDFLocalDataSet" } } }, + "then": { + "required": ["filepath", "key"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to an hdf file." + }, + "key": { + "type": "string", + "description": "Identifier to the group in the HDF store."
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading hdf files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_hdf.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving hdf files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_hdf.html\nAll defaults are preserved." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "PartitionedDataSet" } } }, + "then": { + "required": ["path", "dataset"], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." 
+ }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": "object", + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.15.9/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "CachedDataSet" } } }, + "then": { + "required": ["dataset"], + "properties": { + "dataset": { + "pattern": ".*", + "description": "A Kedro DataSet object or a dictionary to cache." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "JSONDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class.\nE.g. for ``GCSFileSystem`` class: `{project: 'my-project', ...}`" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "CSVHTTPDataSet" } } }, + "then": { + "required": ["fileurl"], + "properties": { + "fileurl": { + "type": "string", + "description": "A URL to fetch the CSV file." + }, + "auth": { + "pattern": ".*", + "description": "Anything ``requests.get`` accepts. Normally it's either\n``('login', 'password')``, or ``AuthBase`` instance for more complex cases." + }, + "load_args": { + "pattern": ".*", + "description": "Pandas options for loading csv files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "MemoryDataSet" } } }, + "then": { + "required": [], + "properties": { + "data": { + "pattern": ".*", + "description": "Python object containing the data." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". 
If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "CSVLocalDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "path to a csv file." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading csv files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving csv files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "ExcelLocalDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "path to an Excel file." + }, + "engine": { + "type": "string", + "description": "The engine used to write to excel files. The default\nengine is 'xlsxwriter'." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Excel files. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nThe default_load_arg engine is 'xlrd', all others preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving Excel files. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under \"writer\" key. 
Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "LambdaDataSet" } } }, + "then": { + "required": ["load", "save"], + "properties": { + "load": { + "pattern": ".*", + "description": "Method to load data from a data set." + }, + "save": { + "pattern": ".*", + "description": "Method to save data to a data set." + }, + "exists": { + "pattern": ".*", + "description": "Method to check whether output data already exists." + }, + "release": { + "pattern": ".*", + "description": "Method to release any cached information." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "HDFS3DataSet" } } }, + "then": { + "required": ["filepath", "key"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to an hdf file. May contain the full path in S3\nincluding bucket and protocol, e.g. `s3://bucket-name/path/to/file.hdf`." + }, + "key": { + "type": "string", + "description": "Identifier to the group in the HDF store." + }, + "bucket_name": { + "type": "string", + "description": "S3 bucket name. Must be specified **only** if not\npresent in ``filepath``." + }, + "credentials": { + "type": "object", + "description": "Credentials to access the S3 bucket, such as\n``aws_access_key_id``, ``aws_secret_access_key``." + }, + "load_args": { + "type": "object", + "description": "PyTables options for loading hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "PyTables options for saving hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "s3fs_args": { + "type": "object", + "description": "S3FileSystem options. 
You can find all available arguments at:\nhttps://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "PickleS3DataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "path to a pkl file. May contain the full path in S3\nincluding bucket and protocol, e.g. `s3://bucket-name/path/to/file.pkl`." + }, + "bucket_name": { + "type": "string", + "description": "S3 bucket name. Must be specified **only** if not\npresent in ``filepath``." + }, + "credentials": { + "type": "object", + "description": "Credentials to access the S3 bucket, such as\n``aws_access_key_id``, ``aws_secret_access_key``." + }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nYou can find all available arguments at:\nhttps://docs.python.org/3/library/pickle.html#pickle.loads\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nYou can see all available arguments at:\nhttps://docs.python.org/3/library/pickle.html#pickle.dumps\nAll defaults are preserved." + }, + "s3fs_args": { + "type": "object", + "description": "S3FileSystem options. You can see all available arguments at:\nhttps://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "SQLTableDataSet" } } }, + "then": { + "required": ["table_name", "credentials"], + "properties": { + "table_name": { + "type": "string", + "description": "The table name to load or save data to. It\noverwrites name in ``save_args`` and ``table_name``\nparameters in ``load_args``." + }, + "credentials": { + "type": "object", + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. 
It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_table``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying pandas ``to_sql`` function along\nwith the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls\nIt has ``index=False`` in the default parameters." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "SQLQueryDataSet" } } }, + "then": { + "required": ["sql", "credentials"], + "properties": { + "sql": { + "type": "string", + "description": "The sql query statement." + }, + "credentials": { + "type": "object", + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_query``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "CSVS3DataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to a csv file. May contain the full path in S3\nincluding bucket and protocol, e.g. `s3://bucket-name/path/to/file.csv`." + }, + "bucket_name": { + "type": "string", + "description": "S3 bucket name. Must be specified **only** if not\npresent in ``filepath``." + }, + "credentials": { + "type": "object", + "description": "Credentials to access the S3 bucket, such as\n``aws_access_key_id``, ``aws_secret_access_key``." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading csv files.\nYou can find all available arguments at:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving csv files.\nYou can find all available arguments at:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "s3fs_args": { + "type": "object", + "description": "S3FileSystem options. 
You can see all available arguments at:\nhttps://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "ParquetLocalDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to a parquet file or a metadata file of a multipart\nparquet collection or the directory of a multipart parquet." + }, + "engine": { + "type": "string", + "description": "The engine to use, one of: `auto`, `fastparquet`,\n`pyarrow`. If `auto`, then the default behavior is to try\n`pyarrow`, falling back to `fastparquet` if `pyarrow` is\nunavailable." + }, + "load_args": { + "type": "object", + "description": "Additional loading options `pyarrow`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html\nor `fastparquet`:\nhttps://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile.to_pandas" + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `pyarrow`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas\nor `fastparquet`:\nhttps://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "TextLocalDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "path to a text file." + }, + "load_args": { + "type": "object", + "description": "Load arguments should be specified in accordance with\nthe built in open function. This can be found at\nhttps://docs.python.org/3/library/functions.html#open" + }, + "save_args": { + "type": "object", + "description": "Save arguments should be specified in accordance with\nthe built in open function. 
This can be found at\nhttps://docs.python.org/3/library/functions.html#open" + } + } + } + } + ] + } + } + } + \ No newline at end of file diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.16.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.16.json new file mode 100644 index 000000000..dc075716d --- /dev/null +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.16.json @@ -0,0 +1,764 @@ +{ + "type": "object", + "patternProperties": { + "^[a-z0-9-_]+$": { + "required": ["type"], + "properties": { + "type": { + "type": "string", + "enum": [ + "PartitionedDataSet", + "CachedDataSet", + "MemoryDataSet", + "LambdaDataSet", + "networkx.NetworkXDataSet", + "dask.ParquetDataSet", + "geopandas.GeoJSONDataSet", + "pillow.ImageDataSet", + "biosequence.BioSequenceDataSet", + "api.APIDataSet", + "matplotlib.MatplotlibWriter", + "yaml.YAMLDataSet", + "pickle.PickleDataSet", + "text.TextDataSet", + "spark.SparkJDBCDataSet", + "spark.SparkHiveDataSet", + "spark.SparkDataSet", + "pandas.JSONDataSet", + "pandas.SQLTableDataSet", + "pandas.SQLQueryDataSet", + "pandas.ParquetDataSet", + "pandas.FeatherDataSet", + "pandas.HDFDataSet", + "pandas.CSVDataSet", + "pandas.ExcelDataSet", + "pandas.GBQTableDataSet" + ] + } + }, + "allOf": [ + { + "if": { "properties": { "type": { "const": "PartitionedDataSet" } } }, + "then": { + "required": ["path", "dataset"], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. 
This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." + }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": "object", + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.16.0/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "CachedDataSet" } } }, + "then": { + "required": ["dataset"], + "properties": { + "dataset": { + "pattern": ".*", + "description": "A Kedro DataSet object or a dictionary to cache." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". 
If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "MemoryDataSet" } } }, + "then": { + "required": [], + "properties": { + "data": { + "pattern": ".*", + "description": "Python object containing the data." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "LambdaDataSet" } } }, + "then": { + "required": ["load", "save"], + "properties": { + "load": { + "pattern": ".*", + "description": "Method to load data from a data set." + }, + "save": { + "pattern": ".*", + "description": "Method to save data to a data set." + }, + "exists": { + "pattern": ".*", + "description": "Method to check whether output data already exists." + }, + "release": { + "pattern": ".*", + "description": "Method to release any cached information." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "networkx.NetworkXDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "The path to the NetworkX graph JSON file." + }, + "load_args": { + "type": "object", + "description": "Arguments passed on to ``networkx.node_link_graph``.\nSee the details in\nhttps://networkx.org/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html" + }, + "save_args": { + "type": "object", + "description": "Arguments passed on to ``networkx.node_link_data``.\nSee the details in\nhttps://networkx.org/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g.
for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "dask.ParquetDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to a parquet file or a metadata file of a multipart\nparquet collection or the directory of a multipart parquet." + }, + "load_args": { + "type": "object", + "description": "Additional loading options for `dask.dataframe.read_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet" + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `dask.dataframe.to_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`."
+ }, + "fs_args": { + "type": "object", + "description": "Optional parameters to the backend file system driver:\nhttps://docs.dask.org/en/latest/remote-data-services.html#optional-parameters" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "geopandas.GeoJSONDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a GeoJSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "GeoPandas options for loading GeoJSON files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference/geopandas.read_file.html" + }, + "save_args": { + "type": "object", + "description": "GeoPandas options for saving geojson files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference.html#geopandas.GeoDataFrame.to_file\nThe default_save_arg driver is 'GeoJSON', all others preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to access the underlying filesystem.\nE.g. for ``GCSFileSystem`` it would look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving."
+ } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pillow.ImageDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to an image file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "Pillow options for saving image files.\nHere you can find all available arguments:\nhttps://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { "const": "biosequence.BioSequenceDataSet" } + } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "path to sequence file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." 
+ }, + "load_args": { + "type": "object", + "description": "Options for parsing sequence files by Biopython ``SeqIO.parse()``." + }, + "save_args": { + "type": "object", + "description": "file format supported by Biopython ``SeqIO.write()``.\nE.g. `{\"format\": \"fasta\"}`." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\n to pass to the filesystem's `open` method through nested keys\n `open_args_load` and `open_args_save`.\n Here you can find all available arguments for `open`:\n https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\n All defaults are preserved, except `mode`, which is set to `r` when loading\n and to `w` when saving.\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "api.APIDataSet" } } }, + "then": { + "required": ["url"], + "properties": { + "url": { + "type": "string", + "description": "The API URL endpoint." + }, + "method": { + "type": "string", + "description": "The Method of the request, GET, POST, PUT, DELETE, HEAD, etc..." 
+ }, + "data": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "params": { + "type": "object", + "description": "The url parameters of the API.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#passing-parameters-in-urls" + }, + "headers": { + "type": "object", + "description": "The HTTP headers.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#custom-headers" + }, + "auth": { + "pattern": ".*", + "description": "Anything ``requests`` accepts. Normally it's either ``('login', 'password')``,\nor ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases." + }, + "timeout": { + "type": "integer", + "description": "The wait time in seconds for a response, defaults to 1 minute.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#timeouts" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "matplotlib.MatplotlibWriter" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Key path to a matplot object file(s) prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. 
for ``S3FileSystem`` it should look like:\n`{'client_kwargs': {'aws_access_key_id': '', 'aws_secret_access_key': ''}}`" + }, + "save_args": { + "type": "object", + "description": "Save args passed to `plt.savefig`. See\nhttps://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "yaml.YAMLDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a YAML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "PyYAML options for saving YAML files (arguments passed\ninto ``yaml.dump``). Here you can find all available arguments:\nhttps://pyyaml.org/wiki/PyYAMLDocumentation\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." 
+ } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pickle.PickleDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a Pickle file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "backend": { + "type": "string", + "description": "Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'." + }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments for different backends:\npickle.load: https://docs.python.org/3/library/pickle.html#pickle.load\njoblib.load: https://joblib.readthedocs.io/en/latest/generated/joblib.load.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments for different backends:\npickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump\njoblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "text.TextDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "spark.SparkJDBCDataSet" } } + }, + "then": { + "required": ["url", "table"], + "properties": { + "url": { + "type": "string", + "description": "A JDBC URL of the form ``jdbc:subprotocol:subname``." + }, + "table": { + "type": "string", + "description": "The name of the table to load or save data to." 
+ }, + "credentials": { + "type": "object", + "description": "A dictionary of JDBC database connection arguments.\nNormally at least properties ``user`` and ``password`` with\ntheir corresponding values. It updates ``properties``\nparameter in ``load_args`` and ``save_args`` in case it is\nprovided." + }, + "load_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameReader.jdbc.html" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "spark.SparkHiveDataSet" } } + }, + "then": { + "required": ["database", "table", "write_mode"], + "properties": { + "database": { + "type": "string", + "description": "The name of the hive database." + }, + "table": { + "type": "string", + "description": "The name of the table within the database." + }, + "write_mode": { + "type": "string", + "description": "``insert``, ``upsert`` or ``overwrite`` are supported." + }, + "table_pk": { + "type": "array", + "description": "If performing an upsert, this identifies the primary key columns used to\nresolve preexisting data. Is required for ``write_mode=\"upsert\"``." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "spark.SparkDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Path to a Spark dataframe. When using Databricks\nand working with data written to mount path points,\nspecify ``filepath``s for (versioned) ``SparkDataSet``s\nstarting with ``/dbfs/mnt``." 
+ }, + "file_format": { + "type": "string", + "description": "File format used during load and save\noperations. The formats supported by the running\nSparkContext include parquet, csv. For a list of supported\nformats please refer to Apache Spark documentation at\nhttps://spark.apache.org/docs/latest/sql-programming-guide.html" + }, + "load_args": { + "type": "object", + "description": "Load args passed to Spark DataFrameReader load method.\nIt is dependent on the selected file format. You can find\na list of read options for each supported format\nin Spark DataFrame read documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "save_args": { + "type": "object", + "description": "Save args passed to Spark DataFrame write options.\nSimilar to load_args this is dependent on the selected file\nformat. You can pass ``mode`` and ``partitionBy`` to specify\nyour overwrite mode and partitioning respectively. You can find\na list of options for each format in Spark DataFrame\nwrite documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "credentials": { + "type": "object", + "description": "Credentials to access the S3 bucket, such as\n``aws_access_key_id``, ``aws_secret_access_key``, if ``filepath``\nprefix is ``s3a://`` or ``s3n://``. Optional keyword arguments passed to\n``hdfs.client.InsecureClient`` if ``filepath`` prefix is ``hdfs://``.\nIgnored otherwise." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.JSONDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.SQLTableDataSet" } } + }, + "then": { + "required": ["table_name", "credentials"], + "properties": { + "table_name": { + "type": "string", + "description": "The table name to load or save data to. It\noverwrites name in ``save_args`` and ``table_name``\nparameters in ``load_args``." + }, + "credentials": { + "type": "object", + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_table``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying pandas ``to_sql`` function along\nwith the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls\nIt has ``index=False`` in the default parameters." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.SQLQueryDataSet" } } + }, + "then": { + "required": ["sql", "credentials"], + "properties": { + "sql": { + "type": "string", + "description": "The sql query statement." + }, + "credentials": { + "type": "object", + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_query``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.ParquetDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a Parquet file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nIt can also be a path to a directory. If the directory is\nprovided then it can be used for reading partitioned parquet files.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Additional options for loading Parquet file(s).\nHere you can find all available arguments when reading single file:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html\nHere you can find all available arguments when reading partitioned datasets:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read\nAll defaults are preserved." 
+ }, + "save_args": { + "type": "object", + "description": "Additional saving options for `pyarrow.parquet.write_table`.\nHere you can find all available arguments:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.FeatherDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a feather file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading feather files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.HDFDataSet" } } }, + "then": { + "required": ["filepath", "key"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a hdf file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "key": { + "type": "string", + "description": "Identifier to the group in the HDF store." + }, + "load_args": { + "type": "object", + "description": "PyTables options for loading hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "PyTables options for saving hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.CSVDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to a CSV file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.ExcelDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath to an Excel file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "engine": { + "type": "string", + "description": "The engine used to write to Excel files. The default\nengine is 'xlsxwriter'." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"xlrd\"." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under \"writer\" key. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. 
for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.GBQTableDataSet" } } + }, + "then": { + "required": ["dataset", "table_name"], + "properties": { + "dataset": { + "type": "string", + "description": "Google BigQuery dataset." + }, + "table_name": { + "type": "string", + "description": "Google BigQuery table name." + }, + "project": { + "type": "string", + "description": "Google BigQuery Account project ID.\nOptional when available from the environment.\nhttps://cloud.google.com/resource-manager/docs/creating-managing-projects" + }, + "credentials": { + "pattern": ".*", + "description": "Credentials for accessing Google APIs.\nEither ``google.auth.credentials.Credentials`` object or dictionary with\nparameters required to instantiate ``google.oauth2.credentials.Credentials``.\nHere you can find all the arguments:\nhttps://google-auth.readthedocs.io/en/latest/reference/google.oauth2.credentials.html" + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading BigQuery table into DataFrame.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html\nAll defaults are preserved." 
+ }, + "save_args": { + "type": "object", + "description": "Pandas options for saving DataFrame to BigQuery table.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html\nAll defaults are preserved, but \"progress_bar\", which is set to False." + } + } + } + } + ] + } + } + } + \ No newline at end of file diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.17.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.17.json new file mode 100644 index 000000000..272e053da --- /dev/null +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.17.json @@ -0,0 +1,951 @@ +{ + "type": "object", + "patternProperties": { + "^[a-z0-9-_]+$": { + "required": ["type"], + "properties": { + "type": { + "type": "string", + "enum": [ + "PartitionedDataSet", + "CachedDataSet", + "MemoryDataSet", + "LambdaDataSet", + "networkx.NetworkXDataSet", + "dask.ParquetDataSet", + "geopandas.GeoJSONDataSet", + "pillow.ImageDataSet", + "json.JSONDataSet", + "biosequence.BioSequenceDataSet", + "tensorflow.TensorFlowModelDataset", + "api.APIDataSet", + "matplotlib.MatplotlibWriter", + "yaml.YAMLDataSet", + "pickle.PickleDataSet", + "text.TextDataSet", + "holoviews.HoloviewsWriter", + "email.EmailMessageDataSet", + "spark.SparkJDBCDataSet", + "spark.SparkHiveDataSet", + "spark.SparkDataSet", + "pandas.AppendableExcelDataSet", + "pandas.JSONDataSet", + "pandas.SQLTableDataSet", + "pandas.SQLQueryDataSet", + "pandas.ParquetDataSet", + "pandas.FeatherDataSet", + "pandas.HDFDataSet", + "pandas.CSVDataSet", + "pandas.ExcelDataSet", + "pandas.GBQTableDataSet", + "pandas.GBQQueryDataSet", + "pandas.GenericDataSet" + ] + } + }, + "allOf": [ + { + "if": { "properties": { "type": { "const": "PartitionedDataSet" } } }, + "then": { + "required": ["path", "dataset"], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., 
``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." + }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": "object", + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.17.0/05_data/02_kedro_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "CachedDataSet" } } }, + "then": { + "required": ["dataset"], + "properties": { + "dataset": { + "pattern": ".*", + "description": "A Kedro DataSet object or a dictionary to cache." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "MemoryDataSet" } } }, + "then": { + "required": [], + "properties": { + "data": { + "pattern": ".*", + "description": "Python object containing the data." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "LambdaDataSet" } } }, + "then": { + "required": ["load", "save"], + "properties": { + "load": { + "pattern": ".*", + "description": "Method to load data from a data set." + }, + "save": { + "pattern": ".*", + "description": "Method to save data to a data set." + }, + "exists": { + "pattern": ".*", + "description": "Method to check whether output data already exists." + }, + "release": { + "pattern": ".*", + "description": "Method to release any cached information." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "networkx.NetworkXDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to the NetworkX graph JSON file." 
+ }, + "load_args": { + "type": "object", + "description": "Arguments passed on to ``networkx.node_link_graph``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html" + }, + "save_args": { + "type": "object", + "description": "Arguments passed on to ``networkx.node_link_data``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "dask.ParquetDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a parquet file,\nparquet collection or the directory of a multipart parquet." 
+ }, + "load_args": { + "type": "object", + "description": "Additional loading options for `dask.dataframe.read_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet" + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `dask.dataframe.to_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Optional parameters to the backend file system driver:\nhttps://docs.dask.org/en/latest/remote-data-services.html#optional-parameters" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "geopandas.GeoJSONDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a GeoJSON file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "GeoPandas options for loading GeoJSON files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference/geopandas.read_file.html" + }, + "save_args": { + "type": "object", + "description": "GeoPandas options for saving GeoJSON files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference.html#geopandas.GeoDataFrame.to_file\nThe default_save_arg driver is 'GeoJSON', all others preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to access the underlying filesystem.\nE.g. for ``GCSFileSystem`` it would look like `{'token': None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pillow.ImageDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to an image file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "Pillow options for saving image files.\nHere you can find all available arguments:\nhttps://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "json.JSONDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ``json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { "const": "biosequence.BioSequenceDataSet" } + } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a sequence file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." + }, + "load_args": { + "type": "object", + "description": "Options for parsing sequence files by Biopython ``SeqIO.parse()``." + }, + "save_args": { + "type": "object", + "description": "File format supported by Biopython ``SeqIO.write()``.\nE.g. `{\"format\": \"fasta\"}`." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving.\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" + } + } + } + }, + { + "if": { + "properties": { + "type": { "const": "tensorflow.TensorFlowModelDataset" } + } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a TensorFlow model directory prefixed with a\nprotocol like `s3://`. If prefix is not provided, `file` protocol (local filesystem)\nwill be used. 
The prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "TensorFlow options for loading models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/load_model\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "TensorFlow options for saving models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/save_model\nAll defaults are preserved, except for \"save_format\", which is set to \"tf\"." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "api.APIDataSet" } } }, + "then": { + "required": ["url"], + "properties": { + "url": { + "type": "string", + "description": "The API URL endpoint." + }, + "method": { + "type": "string", + "description": "The Method of the request, GET, POST, PUT, DELETE, HEAD, etc..." + }, + "data": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "params": { + "type": "object", + "description": "The url parameters of the API.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#passing-parameters-in-urls" + }, + "headers": { + "type": "object", + "description": "The HTTP headers.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#custom-headers" + }, + "auth": { + "pattern": ".*", + "description": "Anything ``requests`` accepts. 
Normally it's either ``('login', 'password')``,\nor ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases." + }, + "json": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc. requests, passed in\nto the json kwarg in the requests object.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "timeout": { + "type": "integer", + "description": "The wait time in seconds for a response, defaults to 1 minute.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#timeouts" + }, + "credentials": { + "pattern": ".*", + "description": "Same as ``auth``. Allows specifying ``auth`` secrets in\ncredentials.yml." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "matplotlib.MatplotlibWriter" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a matplot object file(s) prefixed with a protocol\nlike `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be\nused. The prefix should be any protocol supported by ``fsspec``." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}`" + }, + "save_args": { + "type": "object", + "description": "Save args passed to `plt.savefig`. 
See\nhttps://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "yaml.YAMLDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a YAML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "PyYAML options for saving YAML files (arguments passed\ninto ``yaml.dump``). Here you can find all available arguments:\nhttps://pyyaml.org/wiki/PyYAMLDocumentation\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pickle.PickleDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Pickle file prefixed with a protocol like\n`s3://`. 
If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "backend": { + "type": "string", + "description": "Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'." + }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments for different backends:\npickle.load: https://docs.python.org/3/library/pickle.html#pickle.load\njoblib.load: https://joblib.readthedocs.io/en/latest/generated/joblib.load.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments for different backends:\npickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump\njoblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." 
+ } + } + } + }, + { + "if": { "properties": { "type": { "const": "text.TextDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "holoviews.HoloviewsWriter" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}`" + }, + "save_args": { + "type": "object", + "description": "Extra save args passed to `holoviews.save()`. See\nhttp://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "email.EmailMessageDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "``email`` options for parsing email messages (arguments passed\ninto ``email.parser.Parser.parse``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser.parse\nIf you would like to specify options for the `Parser`,\nyou can include them under the \"parser\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser\nAll defaults are preserved, but \"policy\", which is set to ``email.policy.default``." 
+ }, + "save_args": { + "type": "object", + "description": "``email`` options for generating MIME documents (arguments passed into\n``email.generator.Generator.flatten``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator.flatten\nIf you would like to specify options for the `Generator`,\nyou can include them under the \"generator\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "spark.SparkJDBCDataSet" } } + }, + "then": { + "required": ["url", "table"], + "properties": { + "url": { + "type": "string", + "description": "A JDBC URL of the form ``jdbc:subprotocol:subname``." + }, + "table": { + "type": "string", + "description": "The name of the table to load or save data to." + }, + "credentials": { + "type": "object", + "description": "A dictionary of JDBC database connection arguments.\nNormally at least properties ``user`` and ``password`` with\ntheir corresponding values. It updates ``properties``\nparameter in ``load_args`` and ``save_args`` in case it is\nprovided." 
+ }, + "load_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameReader.jdbc.html" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "spark.SparkHiveDataSet" } } + }, + "then": { + "required": ["database", "table", "write_mode"], + "properties": { + "database": { + "type": "string", + "description": "The name of the hive database." + }, + "table": { + "type": "string", + "description": "The name of the table within the database." + }, + "write_mode": { + "type": "string", + "description": "``insert``, ``upsert`` or ``overwrite`` are supported." + }, + "table_pk": { + "type": "array", + "description": "If performing an upsert, this identifies the primary key columns used to\nresolve preexisting data. It is required for ``write_mode=\"upsert\"``." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "spark.SparkDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Spark dataframe. When using Databricks\nand working with data written to mount path points,\nspecify ``filepath``s for (versioned) ``SparkDataSet``s\nstarting with ``/dbfs/mnt``." + }, + "file_format": { + "type": "string", + "description": "File format used during load and save\noperations. Formats supported by the running\nSparkContext include parquet and csv. 
For a list of supported\nformats please refer to Apache Spark documentation at\nhttps://spark.apache.org/docs/latest/sql-programming-guide.html" + }, + "load_args": { + "type": "object", + "description": "Load args passed to Spark DataFrameReader load method.\nIt is dependent on the selected file format. You can find\na list of read options for each supported format\nin Spark DataFrame read documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "save_args": { + "type": "object", + "description": "Save args passed to Spark DataFrame write options.\nSimilar to load_args this is dependent on the selected file\nformat. You can pass ``mode`` and ``partitionBy`` to specify\nyour overwrite mode and partitioning respectively. You can find\na list of options for each format in Spark DataFrame\nwrite documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "credentials": { + "type": "object", + "description": "Credentials to access the S3 bucket, such as\n``key``, ``secret``, if ``filepath`` prefix is ``s3a://`` or ``s3n://``.\nOptional keyword arguments passed to ``hdfs.client.InsecureClient``\nif ``filepath`` prefix is ``hdfs://``. Ignored otherwise." + } + } + } + }, + { + "if": { + "properties": { + "type": { "const": "pandas.AppendableExcelDataSet" } + } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to an existing local Excel file." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"openpyxl\"." 
+ }, + "save_args": { + "type": "object", + "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under \"writer\" key. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html\nNote: `mode` option of `ExcelWriter` is set to `a` and it can not be overridden." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.JSONDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.SQLTableDataSet" } } + }, + "then": { + "required": ["table_name", "credentials"], + "properties": { + "table_name": { + "type": "string", + "description": "The table name to load or save data to. It\noverwrites name in ``save_args`` and ``table_name``\nparameters in ``load_args``." + }, + "credentials": { + "type": "object", + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_table``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying pandas ``to_sql`` function along\nwith the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls\nIt has ``index=False`` in the default parameters." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.SQLQueryDataSet" } } + }, + "then": { + "required": ["sql", "credentials"], + "properties": { + "sql": { + "type": "string", + "description": "The sql query statement." + }, + "credentials": { + "type": "object", + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_query``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.ParquetDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Parquet file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nIt can also be a path to a directory. If the directory is\nprovided then it can be used for reading partitioned parquet files.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Additional options for loading Parquet file(s).\nHere you can find all available arguments when reading single file:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html\nHere you can find all available arguments when reading partitioned datasets:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read\nAll defaults are preserved." 
+ }, + "save_args": { + "type": "object", + "description": "Additional saving options for `pyarrow.parquet.write_table` and\n`pyarrow.Table.from_pandas`.\nHere you can find all available arguments for `write_table()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table\nThe arguments for `from_pandas()` should be passed through a nested\nkey: `from_pandas`. E.g.: `save_args = {\"from_pandas\": {\"preserve_index\": False}}`\nHere you can find all available arguments for `from_pandas()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.FeatherDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a feather file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading feather files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.HDFDataSet" } } }, + "then": { + "required": ["filepath", "key"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a hdf file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "key": { + "type": "string", + "description": "Identifier to the group in the HDF store." + }, + "load_args": { + "type": "object", + "description": "PyTables options for loading hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." 
+ }, + "save_args": { + "type": "object", + "description": "PyTables options for saving hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.CSVDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a CSV file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." 
+ }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.GenericDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "file_format" : { + "type": "string", + "description": "The read/write methods to retrieve from pandas (`pandas.read_{file_format}` or `pd.DataFrame.to_{file_format}`) on a best effort basis." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." 
+ }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.ExcelDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to an Excel file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "engine": { + "type": "string", + "description": "The engine used to write to excel files. The default\nengine is 'xlsxwriter'." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"xlrd\"." 
+ }, + "save_args": { + "type": "object", + "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under the \"writer\" key. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html" + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.GBQTableDataSet" } } + }, + "then": { + "required": ["dataset", "table_name"], + "properties": { + "dataset": { + "type": "string", + "description": "Google BigQuery dataset." + }, + "table_name": { + "type": "string", + "description": "Google BigQuery table name." 
+ }, + "project": { + "type": "string", + "description": "Google BigQuery Account project ID.\nOptional when available from the environment.\nhttps://cloud.google.com/resource-manager/docs/creating-managing-projects" + }, + "credentials": { + "pattern": ".*", + "description": "Credentials for accessing Google APIs.\nEither ``google.auth.credentials.Credentials`` object or dictionary with\nparameters required to instantiate ``google.oauth2.credentials.Credentials``.\nHere you can find all the arguments:\nhttps://google-auth.readthedocs.io/en/latest/reference/google.oauth2.credentials.html" + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading BigQuery table into DataFrame.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving DataFrame to BigQuery table.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html\nAll defaults are preserved, but \"progress_bar\", which is set to False." 
+ } + } + } + } + ] + } + } + } + \ No newline at end of file diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json new file mode 100644 index 000000000..f3b81a46a --- /dev/null +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json @@ -0,0 +1,1424 @@ +{ + "type": "object", + "patternProperties": { + "^[a-z0-9-_]+$": { + "required": [ + "type" + ], + "properties": { + "type": { + "type": "string", + "enum": [ + "CachedDataSet", + "IncrementalDataSet", + "MemoryDataSet", + "LambdaDataSet", + "PartitionedDataSet", + "api.APIDataSet", + "biosequence.BioSequenceDataSet", + "dask.ParquetDataSet", + "email.EmailMessageDataSet", + "geopandas.GeoJSONDataSet", + "holoviews.HoloviewsWriter", + "json.JSONDataSet", + "matplotlib.MatplotlibWriter", + "networkx.NetworkXDataSet", + "pandas.CSVDataSet", + "pandas.ExcelDataSet", + "pandas.FeatherDataSet", + "pandas.GBQTableDataSet", + "pandas.HDFDataSet", + "pandas.JSONDataSet", + "pandas.ParquetDataSet", + "pandas.SQLTableDataSet", + "pandas.SQLQueryDataSet", + "pandas.XMLDataSet", + "pillow.ImageDataSet", + "pickle.PickleDataSet", + "plotly.PlotlyDataSet", + "redis.PickleDataSet", + "spark.SparkDataSet", + "spark.SparkHiveDataSet", + "spark.SparkJDBCDataSet", + "tensorflow.TensorFlowModelDataset", + "text.TextDataSet", + "tracking.JSONDataSet", + "tracking.MetricsDataSet", + "yaml.YAMLDataSet" + ] + } + }, + "allOf": [ + { + "if": { + "properties": { + "type": { + "const": "CachedDataSet" + } + } + }, + "then": { + "required": [ + "dataset" + ], + "properties": { + "dataset": { + "pattern": ".*", + "description": "A Kedro DataSet object or a dictionary to cache." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "IncrementalDataSet" + } + } + }, + "then": { + "required": [ + "path", + "dataset" + ], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "checkpoint": { + "type": ["object", "string"], + "description": "Optional checkpoint configuration. Accepts a dictionary\nwith the corresponding dataset definition including ``filepath``\n(unlike ``dataset`` argument). Checkpoint configuration is\ndescribed here:\nhttps://kedro.readthedocs.io/en/0.18.0/data/kedro_io.html#checkpoint-configuration\nCredentials for the checkpoint can be explicitly specified\nin this configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." 
+ }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.18.0/data/kedro_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "MemoryDataSet" + } + } + }, + "then": { + "required": [], + "properties": { + "data": { + "pattern": ".*", + "description": "Python object containing the data." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "LambdaDataSet" + } + } + }, + "then": { + "required": [ + "load", + "save" + ], + "properties": { + "load": { + "pattern": ".*", + "description": "Method to load data from a data set." + }, + "save": { + "pattern": ".*", + "description": "Method to save data to a data set." + }, + "exists": { + "pattern": ".*", + "description": "Method to check whether output data already exists." 
+ }, + "release": { + "pattern": ".*", + "description": "Method to release any cached information." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "PartitionedDataSet" + } + } + }, + "then": { + "required": [ + "path", + "dataset" + ], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." + }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. 
If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.18.0/data/kedro_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "api.APIDataSet" + } + } + }, + "then": { + "required": [ + "url" + ], + "properties": { + "url": { + "type": "string", + "description": "The API URL endpoint." + }, + "method": { + "type": "string", + "description": "The Method of the request, GET, POST, PUT, DELETE, HEAD, etc..." + }, + "data": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "params": { + "type": "object", + "description": "The url parameters of the API.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#passing-parameters-in-urls" + }, + "headers": { + "type": "object", + "description": "The HTTP headers.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#custom-headers" + }, + "auth": { + "pattern": ".*", + "description": "Anything ``requests`` accepts. Normally it's either ``('login', 'password')``,\nor ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases." 
+ }, + "json": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests, passed in\nto the json kwarg in the requests object.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "timeout": { + "type": "integer", + "description": "The wait time in seconds for a response, defaults to 1 minute.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#timeouts" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "biosequence.BioSequenceDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to sequence file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." + }, + "load_args": { + "type": "object", + "description": "Options for parsing sequence files by Biopython ``SeqIO.parse()``." + }, + "save_args": { + "type": "object", + "description": "file format supported by Biopython ``SeqIO.write()``.\nE.g. `{\"format\": \"fasta\"}`." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\n to pass to the filesystem's `open` method through nested keys\n `open_args_load` and `open_args_save`.\n Here you can find all available arguments for `open`:\n https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\n All defaults are preserved, except `mode`, which is set to `r` when loading\n and to `w` when saving.\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "dask.ParquetDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a parquet file\nparquet collection or the directory of a multipart parquet." + }, + "load_args": { + "type": "object", + "description": "Additional loading options `dask.dataframe.read_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet" + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `dask.dataframe.to_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Optional parameters to the backend file system driver:\nhttps://docs.dask.org/en/latest/remote-data-services.html#optional-parameters" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "email.EmailMessageDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "``email`` options for parsing email messages (arguments passed\ninto ``email.parser.Parser.parse``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser.parse\nIf you would like to specify options for the `Parser`,\nyou can include them under the \"parser\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser\nAll defaults are preserved, but \"policy\", which is set to ``email.policy.default``." + }, + "save_args": { + "type": "object", + "description": "``email`` options for generating MIME documents (arguments passed into\n``email.generator.Generator.flatten``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator.flatten\nIf you would like to specify options for the `Generator`,\nyou can include them under the \"generator\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator\nAll defaults are preserved." 
+ }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "geopandas.GeoJSONDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a GeoJSON file prefixed with a protocol like\n`s3://`. If prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "GeoPandas options for loading GeoJSON files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference/geopandas.read_file.html" + }, + "save_args": { + "type": "object", + "description": "GeoPandas options for saving geojson files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference.html#geopandas.GeoDataFrame.to_file\nThe default_save_arg driver is 'GeoJSON', all others preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to access the underlying filesystem.\nE.g. for ``GCSFileSystem`` it would look like `{'token': None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "holoviews.HoloviewsWriter" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}`" + }, + "save_args": { + "type": "object", + "description": "Extra save args passed to `holoviews.save()`. 
See\nhttps://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "json.JSONDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ``json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "matplotlib.MatplotlibWriter" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a matplot object file(s) prefixed with a protocol\nlike `s3://`. 
If prefix is not provided, `file` protocol (local filesystem) will be\nused. The prefix should be any protocol supported by ``fsspec``." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}`" + }, + "save_args": { + "type": "object", + "description": "Save args passed to `plt.savefig`. See\nhttps://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "networkx.NetworkXDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to the NetworkX graph JSON file." 
+ }, + "load_args": { + "type": "object", + "description": "Arguments passed on to ``networkx.node_link_graph``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html" + }, + "save_args": { + "type": "object", + "description": "Arguments passed on to ``networkx.node_link_data``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.CSVDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a CSV file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.ExcelDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Excel file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "engine": { + "type": "string", + "description": "The engine used to write to excel files. The default\nengine is 'xlsxwriter'." 
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"xlrd\"." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under the \"writer\" key. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.FeatherDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a feather file prefixed with a protocol like\n`s3://`. 
If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading feather files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.GBQTableDataSet" + } + } + }, + "then": { + "required": [ + "dataset", + "table_name" + ], + "properties": { + "dataset": { + "type": "string", + "description": "Google BigQuery dataset." + }, + "table_name": { + "type": "string", + "description": "Google BigQuery table name." 
+ }, + "project": { + "type": "string", + "description": "Google BigQuery Account project ID.\nOptional when available from the environment.\nhttps://cloud.google.com/resource-manager/docs/creating-managing-projects" + }, + "credentials": { + "pattern": ".*", + "description": "Credentials for accessing Google APIs.\nEither ``google.auth.credentials.Credentials`` object or dictionary with\nparameters required to instantiate ``google.oauth2.credentials.Credentials``.\nHere you can find all the arguments:\nhttps://google-auth.readthedocs.io/en/latest/reference/google.oauth2.credentials.html" + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading BigQuery table into DataFrame.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving DataFrame to BigQuery table.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html\nAll defaults are preserved, but \"progress_bar\", which is set to False." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.HDFDataSet" + } + } + }, + "then": { + "required": [ + "filepath", + "key" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a hdf file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "key": { + "type": "string", + "description": "Identifier to the group in the HDF store." 
+ }, + "load_args": { + "type": "object", + "description": "PyTables options for loading hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "PyTables options for saving hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.JSONDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved."
+ }, + "save_args": { + "type": "object", + "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.ParquetDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Parquet file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nIt can also be a path to a directory. If the directory is\nprovided then it can be used for reading partitioned parquet files.\nNote: `http(s)` doesn't support versioning." 
+ }, + "load_args": { + "type": "object", + "description": "Additional options for loading Parquet file(s).\nHere you can find all available arguments when reading single file:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html\nHere you can find all available arguments when reading partitioned datasets:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `pyarrow.parquet.write_table` and\n`pyarrow.Table.from_pandas`.\nHere you can find all available arguments for `write_table()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table\nThe arguments for `from_pandas()` should be passed through a nested\nkey: `from_pandas`. E.g.: `save_args = {\"from_pandas\": {\"preserve_index\": False}}`\nHere you can find all available arguments for `from_pandas()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.SQLTableDataSet" + } + } + }, + "then": { + "required": [ + "table_name", + "credentials" + ], + "properties": { + "table_name": { + "type": "string", + "description": "The table name to load or save data to. It\noverwrites name in ``save_args`` and ``table_name``\nparameters in ``load_args``." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_table``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying pandas ``to_sql`` function along\nwith the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls\nIt has ``index=False`` in the default parameters." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.SQLQueryDataSet" + } + } + }, + "then": { + "required": [ + "sql", + "credentials" + ], + "properties": { + "sql": { + "type": "string", + "description": "The sql query statement." 
+ }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_query``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "execution_options": { + "type": "object", + "description": "A dictionary with non-SQL options for the connection\nto be applied to the underlying engine.\nTo find all supported execution options, see here:\nhttps://docs.sqlalchemy.org/en/12/core/connections.html#sqlalchemy.engine.Connection.execution_options \nNote that this is not a standard argument supported by pandas API, but could be useful for handling large datasets." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.XMLDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a XML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading XML files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_xml.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving XML files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_xml.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pickle.PickleDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Pickle file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "backend": { + "type": "string", + "description": "Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'." 
+ }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments for different backends:\npickle.load: https://docs.python.org/3/library/pickle.html#pickle.load\njoblib.load: https://joblib.readthedocs.io/en/latest/generated/joblib.load.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments for different backends:\npickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump\njoblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pillow.ImageDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to an image file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "save_args": { + "type": "object", + "description": "Pillow options for saving image files.\nHere you can find all available arguments:\nhttps://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "plotly.PlotlyDataSet" + } + } + }, + "then": { + "required": [ + "filepath", + "plotly_args" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "plotly_args": { + "type": "object", + "description": "Plotly configuration for generating a plotly graph object Figure\nrepresenting the plotted data." + }, + "load_args": { + "type": "object", + "description": "Plotly options for loading JSON files.\nHere you can find all available arguments:\nhttps://plotly.com/python-api-reference/generated/plotly.io.from_json.html#plotly.io.from_json\nAll defaults are preserved." 
+ }, + "save_args": { + "type": "object", + "description": "Plotly options for saving JSON files.\nHere you can find all available arguments:\nhttps://plotly.com/python-api-reference/generated/plotly.io.write_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "redis.PickleDataSet" + } + } + }, + "then": { + "required": [ + "key" + ], + "properties": { + "key": { + "type": "string", + "description": "The key to use for saving/loading object to Redis." + }, + "backend": { + "type": "string", + "description": "Backend to use, must be an import path to a module which satisfies the ``pickle`` interface.\nThat is, contains a `loads` and `dumps` function. Defaults to 'pickle'." + }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments:\nhttps://docs.python.org/3/library/pickle.html#pickle.loads\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments:\nhttps://docs.python.org/3/library/pickle.html#pickle.dumps\nAll defaults are preserved." 
+ }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the redis server." + }, + "redis_args": { + "type": "object", + "description": "Extra arguments to pass into the redis client constructor ``redis.StrictRedis.from_url``, as well as to pass to the ``redis.StrictRedis.set`` method." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "spark.SparkDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Spark dataframe. When using Databricks\nand working with data written to mount path points,\nspecify ``filepath``s for (versioned) ``SparkDataSet``s\nstarting with ``/dbfs/mnt``." + }, + "file_format": { + "type": "string", + "description": "File format used during load and save\noperations. The formats supported by the running\nSparkContext include parquet and csv. For a list of supported\nformats please refer to Apache Spark documentation at\nhttps://spark.apache.org/docs/latest/sql-programming-guide.html" + }, + "load_args": { + "type": "object", + "description": "Load args passed to Spark DataFrameReader load method.\nIt is dependent on the selected file format. You can find\na list of read options for each supported format\nin Spark DataFrame read documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "save_args": { + "type": "object", + "description": "Save args passed to Spark DataFrame write options.\nSimilar to load_args this is dependent on the selected file\nformat. You can pass ``mode`` and ``partitionBy`` to specify\nyour overwrite mode and partitioning respectively.
You can find\na list of options for each format in Spark DataFrame\nwrite documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials to access the S3 bucket, such as\n``key``, ``secret``, if ``filepath`` prefix is ``s3a://`` or ``s3n://``.\nOptional keyword arguments passed to ``hdfs.client.InsecureClient``\nif ``filepath`` prefix is ``hdfs://``. Ignored otherwise." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "spark.SparkHiveDataSet" + } + } + }, + "then": { + "required": [ + "database", + "table", + "write_mode" + ], + "properties": { + "database": { + "type": "string", + "description": "The name of the hive database." + }, + "table": { + "type": "string", + "description": "The name of the table within the database." + }, + "write_mode": { + "type": "string", + "description": "``insert``, ``upsert`` or ``overwrite`` are supported." + }, + "table_pk": { + "type": "array", + "description": "If performing an upsert, this identifies the primary key columns used to\nresolve preexisting data. Is required for ``write_mode=\"upsert\"``." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "spark.SparkJDBCDataSet" + } + } + }, + "then": { + "required": [ + "url", + "table" + ], + "properties": { + "url": { + "type": "string", + "description": "A JDBC URL of the form ``jdbc:subprotocol:subname``." + }, + "table": { + "type": "string", + "description": "The name of the table to load or save data to." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "A dictionary of JDBC database connection arguments.\nNormally at least properties ``user`` and ``password`` with\ntheir corresponding values. It updates ``properties``\nparameter in ``load_args`` and ``save_args`` in case it is\nprovided." 
+ }, + "load_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameReader.jdbc.html" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "tensorflow.TensorFlowModelDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a TensorFlow model directory prefixed with a\nprotocol like `s3://`. If prefix is not provided `file` protocol (local filesystem)\nwill be used. The prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "TensorFlow options for loading models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/load_model\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "TensorFlow options for saving models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/save_model\nAll defaults are preserved, except for \"save_format\", which is set to \"tf\"." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "text.TextDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "tracking.JSONDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ``json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "tracking.MetricsDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ``json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False."
+ }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "yaml.YAMLDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a YAML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "PyYAML options for saving YAML files (arguments passed\ninto ```yaml.dump``). Here you can find all available arguments:\nhttps://pyyaml.org/wiki/PyYAMLDocumentation\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + } + ] + } + } + } + \ No newline at end of file diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json new file mode 100644 index 000000000..fa5645084 --- /dev/null +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json @@ -0,0 +1,1471 @@ +{ + "type": "object", + "patternProperties": { + "^[a-z0-9-_]+$": { + "required": [ + "type" + ], + "properties": { + "type": { + "type": "string", + "enum": [ + "CachedDataset", + "IncrementalDataset", + "MemoryDataset", + "LambdaDataset", + "partitions.PartitionedDataset", + "api.APIDataset", + "biosequence.BioSequenceDataset", + "dask.ParquetDataset", + "email.EmailMessageDataset", + "geopandas.GeoJSONDataset", + "holoviews.HoloviewsWriter", + "huggingface.HFDataset", + "huggingface.HFTransformerPipelineDataset", + "json.JSONDataset", + "matplotlib.MatplotlibWriter", + "networkx.NetworkXDataset", + "pandas.CSVDataset", + "pandas.ExcelDataset", + "pandas.FeatherDataset", + "pandas.GBQTableDataset", + "pandas.HDFDataset", + "pandas.JSONDataset", + "pandas.ParquetDataset", + "pandas.SQLTableDataset", + "pandas.SQLQueryDataset", + "pandas.XMLDataset", + "pillow.ImageDataset", + "pickle.PickleDataset", + "plotly.PlotlyDataset", + "redis.PickleDataset", + "spark.SparkDataset", + "spark.SparkHiveDataset", + "spark.SparkJDBCDataset", + "tensorflow.TensorFlowModelDataset", + "text.TextDataset", + "tracking.JSONDataset", + "tracking.MetricsDataset", + "yaml.YAMLDataset" + ] + } + }, + "allOf": [ + { + "if": { + "properties": { 
+ "type": { + "const": "CachedDataset" + } + } + }, + "then": { + "required": [ + "dataset" + ], + "properties": { + "dataset": { + "pattern": ".*", + "description": "A Kedro Dataset object or a dictionary to cache." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "IncrementalDataset" + } + } + }, + "then": { + "required": [ + "path", + "dataset" + ], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataset``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataset``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "checkpoint": { + "type": ["object", "string"], + "description": "Optional checkpoint configuration. Accepts a dictionary\nwith the corresponding dataset definition including ``filepath``\n(unlike ``dataset`` argument).
Checkpoint configuration is\ndescribed here:\nhttps://kedro.readthedocs.io/en/0.19.0/data/kedro_io.html#checkpoint-configuration\nCredentials for the checkpoint can be explicitly specified\nin this configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." + }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.19.0/data/kedro_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "MemoryDataset" + } + } + }, + "then": { + "required": [], + "properties": { + "data": { + "pattern": ".*", + "description": "Python object containing the data." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "LambdaDataset" + } + } + }, + "then": { + "required": [ + "load", + "save" + ], + "properties": { + "load": { + "pattern": ".*", + "description": "Method to load data from a data set." + }, + "save": { + "pattern": ".*", + "description": "Method to save data to a data set." + }, + "exists": { + "pattern": ".*", + "description": "Method to check whether output data already exists." + }, + "release": { + "pattern": ".*", + "description": "Method to release any cached information." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "partitions.PartitionedDataset" + } + } + }, + "then": { + "required": [ + "path", + "dataset" + ], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataset``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataset``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"."
+ }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.19.0/data/kedro_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "api.APIDataset" + } + } + }, + "then": { + "required": [ + "url" + ], + "properties": { + "url": { + "type": "string", + "description": "The API URL endpoint." + }, + "method": { + "type": "string", + "description": "The Method of the request, GET, POST, PUT, DELETE, HEAD, etc..." + }, + "data": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "params": { + "type": "object", + "description": "The url parameters of the API.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#passing-parameters-in-urls" + }, + "headers": { + "type": "object", + "description": "The HTTP headers.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#custom-headers" + }, + "auth": { + "pattern": ".*", + "description": "Anything ``requests`` accepts. 
Normally it's either ``('login', 'password')``,\nor ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases." + }, + "json": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests, passed in\nto the json kwarg in the requests object.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "timeout": { + "type": "integer", + "description": "The wait time in seconds for a response, defaults to 1 minute.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#timeouts" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "biosequence.BioSequenceDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to sequence file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." + }, + "load_args": { + "type": "object", + "description": "Options for parsing sequence files by Biopython ``SeqIO.parse()``." + }, + "save_args": { + "type": "object", + "description": "file format supported by Biopython ``SeqIO.write()``.\nE.g. `{\"format\": \"fasta\"}`." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\n to pass to the filesystem's `open` method through nested keys\n `open_args_load` and `open_args_save`.\n Here you can find all available arguments for `open`:\n https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\n All defaults are preserved, except `mode`, which is set to `r` when loading\n and to `w` when saving.\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "dask.ParquetDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a parquet file\nparquet collection or the directory of a multipart parquet." + }, + "load_args": { + "type": "object", + "description": "Additional loading options `dask.dataframe.read_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet" + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `dask.dataframe.to_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Optional parameters to the backend file system driver:\nhttps://docs.dask.org/en/latest/remote-data-services.html#optional-parameters" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "email.EmailMessageDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "``email`` options for parsing email messages (arguments passed\ninto ``email.parser.Parser.parse``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser.parse\nIf you would like to specify options for the `Parser`,\nyou can include them under the \"parser\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser\nAll defaults are preserved, but \"policy\", which is set to ``email.policy.default``." + }, + "save_args": { + "type": "object", + "description": "``email`` options for generating MIME documents (arguments passed into\n``email.generator.Generator.flatten``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator.flatten\nIf you would like to specify options for the `Generator`,\nyou can include them under the \"generator\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator\nAll defaults are preserved." 
+ }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "geopandas.GeoJSONDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a GeoJSON file prefixed with a protocol like\n`s3://`. If prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "GeoPandas options for loading GeoJSON files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference/geopandas.read_file.html" + }, + "save_args": { + "type": "object", + "description": "GeoPandas options for saving geojson files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference.html#geopandas.GeoDataFrame.to_file\nThe default_save_arg driver is 'GeoJSON', all others preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to access the underlying filesystem.\nE.g. for ``GCSFileSystem`` it would look like `{'token': None}`."
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "holoviews.HoloviewsWriter" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}`" + }, + "save_args": { + "type": "object", + "description": "Extra save args passed to `holoviews.save()`.
See\nhttps://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "huggingface.HFDataset" + } + } + }, + "then": { + "required": [ + "dataset_name" + ], + "properties": { + "dataset_name": { + "type": "string", + "description": "Huggingface dataset name" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "huggingface.HFTransformerPipelineDataset" + } + } + }, + "then": { + "properties": { + "task": { + "type": "string", + "description": "Huggingface pipeline task name" + }, + "model_name": { + "type": "string", + "description": "Huggingface model name" + }, + "pipeline_kwargs": { + "type": "object", + "description": "Additional kwargs to be passed into the pipeline" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "json.JSONDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "matplotlib.MatplotlibWriter" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a matplot object file(s) prefixed with a protocol\nlike `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be\nused. The prefix should be any protocol supported by ``fsspec``." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}`" + }, + "save_args": { + "type": "object", + "description": "Save args passed to `plt.savefig`.
See\nhttps://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "networkx.NetworkXDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to the NetworkX graph JSON file." + }, + "load_args": { + "type": "object", + "description": "Arguments passed on to ```networkx.node_link_graph``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html" + }, + "save_args": { + "type": "object", + "description": "Arguments passed on to ```networkx.node_link_data``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.CSVDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a CSV file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.ExcelDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Excel file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "engine": { + "type": "string", + "description": "The engine used to write to excel files. The default\nengine is 'xlsxwriter'." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"xlrd\"." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under the \"writer\" key. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.FeatherDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a feather file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading feather files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.GBQTableDataset" + } + } + }, + "then": { + "required": [ + "dataset", + "table_name" + ], + "properties": { + "dataset": { + "type": "string", + "description": "Google BigQuery dataset." + }, + "table_name": { + "type": "string", + "description": "Google BigQuery table name." + }, + "project": { + "type": "string", + "description": "Google BigQuery Account project ID.\nOptional when available from the environment.\nhttps://cloud.google.com/resource-manager/docs/creating-managing-projects" + }, + "credentials": { + "pattern": ".*", + "description": "Credentials for accessing Google APIs.\nEither ``google.auth.credentials.Credentials`` object or dictionary with\nparameters required to instantiate ``google.oauth2.credentials.Credentials``.\nHere you can find all the arguments:\nhttps://google-auth.readthedocs.io/en/latest/reference/google.oauth2.credentials.html" + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading BigQuery table into DataFrame.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving DataFrame to BigQuery table.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html\nAll defaults are preserved, but \"progress_bar\", which is set to False." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.HDFDataset" + } + } + }, + "then": { + "required": [ + "filepath", + "key" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a hdf file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "key": { + "type": "string", + "description": "Identifier to the group in the HDF store." + }, + "load_args": { + "type": "object", + "description": "PyTables options for loading hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "PyTables options for saving hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving."
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.JSONDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.ParquetDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Parquet file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nIt can also be a path to a directory. If the directory is\nprovided then it can be used for reading partitioned parquet files.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Additional options for loading Parquet file(s).\nHere you can find all available arguments when reading single file:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html\nHere you can find all available arguments when reading partitioned datasets:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `pyarrow.parquet.write_table` and\n`pyarrow.Table.from_pandas`.\nHere you can find all available arguments for `write_table()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table\nThe arguments for `from_pandas()` should be passed through a nested\nkey: `from_pandas`. E.g.: `save_args = {\"from_pandas\": {\"preserve_index\": False}}`\nHere you can find all available arguments for `from_pandas()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.SQLTableDataset" + } + } + }, + "then": { + "required": [ + "table_name", + "credentials" + ], + "properties": { + "table_name": { + "type": "string", + "description": "The table name to load or save data to. It\noverwrites name in ``save_args`` and ``table_name``\nparameters in ``load_args``." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_table``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying pandas ``to_sql`` function along\nwith the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls\nIt has ``index=False`` in the default parameters." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.SQLQueryDataset" + } + } + }, + "then": { + "required": [ + "sql", + "credentials" + ], + "properties": { + "sql": { + "type": "string", + "description": "The sql query statement." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_query``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "execution_options": { + "type": "object", + "description": "A dictionary with non-SQL options for the connection\nto be applied to the underlying engine.\nTo find all supported execution options, see here:\nhttps://docs.sqlalchemy.org/en/12/core/connections.html#sqlalchemy.engine.Connection.execution_options \nNote that this is not a standard argument supported by pandas API, but could be useful for handling large datasets." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.XMLDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a XML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading XML files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_xml.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving XML files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_xml.html\nAll defaults are preserved, but \"index\", which is set to False." 
+ }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pickle.PickleDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Pickle file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "backend": { + "type": "string", + "description": "Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'." + }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments for different backends:\npickle.load: https://docs.python.org/3/library/pickle.html#pickle.load\njoblib.load: https://joblib.readthedocs.io/en/latest/generated/joblib.load.html\nAll defaults are preserved." 
+ }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments for different backends:\npickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump\njoblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pillow.ImageDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to an image file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "Pillow options for saving image files.\nHere you can find all available arguments:\nhttps://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. 
for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "plotly.PlotlyDataset" + } + } + }, + "then": { + "required": [ + "filepath", + "plotly_args" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "plotly_args": { + "type": "object", + "description": "Plotly configuration for generating a plotly graph object Figure\nrepresenting the plotted data." + }, + "load_args": { + "type": "object", + "description": "Plotly options for loading JSON files.\nHere you can find all available arguments:\nhttps://plotly.com/python-api-reference/generated/plotly.io.from_json.html#plotly.io.from_json\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Plotly options for saving JSON files.\nHere you can find all available arguments:\nhttps://plotly.com/python-api-reference/generated/plotly.io.write_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. 
for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "redis.PickleDataset" + } + } + }, + "then": { + "required": [ + "key" + ], + "properties": { + "key": { + "type": "string", + "description": "The key to use for saving/loading object to Redis." + }, + "backend": { + "type": "string", + "description": "Backend to use, must be an import path to a module which satisfies the ``pickle`` interface.\nThat is, contains a `loads` and `dumps` function. Defaults to 'pickle'." + }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments:\nhttps://docs.python.org/3/library/pickle.html#pickle.loads\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments:\nhttps://docs.python.org/3/library/pickle.html#pickle.dumps\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the redis server." 
+ }, + "redis_args": { + "type": "object", + "description": "Extra arguments to pass into the redis client constructor ``redis.StrictRedis.from_url``, as well as to pass to the ``redis.StrictRedis.set``" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "spark.SparkDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Spark dataframe. When using Databricks\nand working with data written to mount path points,\nspecify ``filepath``s for (versioned) ``SparkDataset``s\nstarting with ``/dbfs/mnt``." + }, + "file_format": { + "type": "string", + "description": "File format used during load and save\noperations. These are formats supported by the running\nSparkContext include parquet, csv. For a list of supported\nformats please refer to Apache Spark documentation at\nhttps://spark.apache.org/docs/latest/sql-programming-guide.html" + }, + "load_args": { + "type": "object", + "description": "Load args passed to Spark DataFrameReader load method.\nIt is dependent on the selected file format. You can find\na list of read options for each supported format\nin Spark DataFrame read documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "save_args": { + "type": "object", + "description": "Save args passed to Spark DataFrame write options.\nSimilar to load_args this is dependent on the selected file\nformat. You can pass ``mode`` and ``partitionBy`` to specify\nyour overwrite mode and partitioning respectively. 
You can find\na list of options for each format in Spark DataFrame\nwrite documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials to access the S3 bucket, such as\n``key``, ``secret``, if ``filepath`` prefix is ``s3a://`` or ``s3n://``.\nOptional keyword arguments passed to ``hdfs.client.InsecureClient``\nif ``filepath`` prefix is ``hdfs://``. Ignored otherwise." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "spark.SparkHiveDataset" + } + } + }, + "then": { + "required": [ + "database", + "table", + "write_mode" + ], + "properties": { + "database": { + "type": "string", + "description": "The name of the hive database." + }, + "table": { + "type": "string", + "description": "The name of the table within the database." + }, + "write_mode": { + "type": "string", + "description": "``insert``, ``upsert`` or ``overwrite`` are supported." + }, + "table_pk": { + "type": "array", + "description": "If performing an upsert, this identifies the primary key columns used to\nresolve preexisting data. Is required for ``write_mode=\"upsert\"``." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "spark.SparkJDBCDataset" + } + } + }, + "then": { + "required": [ + "url", + "table" + ], + "properties": { + "url": { + "type": "string", + "description": "A JDBC URL of the form ``jdbc:subprotocol:subname``." + }, + "table": { + "type": "string", + "description": "The name of the table to load or save data to." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "A dictionary of JDBC database connection arguments.\nNormally at least properties ``user`` and ``password`` with\ntheir corresponding values. It updates ``properties``\nparameter in ``load_args`` and ``save_args`` in case it is\nprovided." 
+ }, + "load_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameReader.jdbc.html" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "tensorflow.TensorFlowModelDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a TensorFlow model directory prefixed with a\nprotocol like `s3://`. If prefix is not provided `file` protocol (local filesystem)\nwill be used. The prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "TensorFlow options for loading models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/load_model\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "TensorFlow options for saving models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/save_model\nAll defaults are preserved, except for \"save_format\", which is set to \"tf\"." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "text.TextDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "tracking.JSONDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ``json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "tracking.MetricsDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ``json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False."
+ }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "yaml.YAMLDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a YAML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "PyYAML options for saving YAML files (arguments passed\ninto ``yaml.dump``). Here you can find all available arguments:\nhttps://pyyaml.org/wiki/PyYAMLDocumentation\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g.
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + } + ] + } + } + } + \ No newline at end of file From b6b2fb2272aab9e6737dc993e228eca49898b780 Mon Sep 17 00:00:00 2001 From: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> Date: Sun, 1 Dec 2024 04:56:49 +0000 Subject: [PATCH 2/6] #4258 Update PR template Signed-off-by: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> --- .github/PULL_REQUEST_TEMPLATE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6723cb70f..700653249 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -8,6 +8,7 @@ - [ ] Opened this PR as a 'Draft Pull Request' if it is work-in-progress - [ ] Updated the documentation to reflect the code changes +- [ ] Updated `jsonschema/kedro-catalog-X.XX.json` if necessary - [ ] Added a description of this change in the relevant `RELEASE.md` file - [ ] Added tests to cover my changes - [ ] Received approvals from at least half of the TSC (required for adding a new, non-experimental dataset) From cf5bdb6c68af178f84ec812afa5728d085f606e2 Mon Sep 17 00:00:00 2001 From: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> Date: Sun, 1 Dec 2024 18:22:48 +0000 Subject: [PATCH 3/6] Trim trailing whitespace with pre-commit Signed-off-by: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> --- kedro-datasets/static/jsonschema/kedro-catalog-0.15.9.json | 1 - kedro-datasets/static/jsonschema/kedro-catalog-0.16.json | 1 - kedro-datasets/static/jsonschema/kedro-catalog-0.17.json | 1 - 
kedro-datasets/static/jsonschema/kedro-catalog-0.18.json | 1 - kedro-datasets/static/jsonschema/kedro-catalog-0.19.json | 1 - 5 files changed, 5 deletions(-) diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.15.9.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.15.9.json index 18a25576b..5cd06d738 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.15.9.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.15.9.json @@ -1200,4 +1200,3 @@ } } } - \ No newline at end of file diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.16.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.16.json index dc075716d..7d6b7cb5e 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.16.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.16.json @@ -761,4 +761,3 @@ } } } - \ No newline at end of file diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.17.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.17.json index 272e053da..7facbd8b6 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.17.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.17.json @@ -948,4 +948,3 @@ } } } - \ No newline at end of file diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json index f3b81a46a..195f0234a 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json @@ -1421,4 +1421,3 @@ } } } - \ No newline at end of file diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json index fa5645084..d5b608d50 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json @@ -1468,4 +1468,3 @@ } } } - \ No newline at end of file From 4f195dc6abf60a8ef8c5da4c4f29fb71a1e1c97d Mon Sep 17 00:00:00 2001 From: Chris Schopp 
<56572144+chrisschopp@users.noreply.github.com> Date: Wed, 4 Dec 2024 02:32:36 +0000 Subject: [PATCH 4/6] Clarify that `static/jsonschema` is now in `kedro-datasets` Signed-off-by: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> --- kedro-datasets/CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/CONTRIBUTING.md b/kedro-datasets/CONTRIBUTING.md index 8ef5ae52c..c8652c678 100644 --- a/kedro-datasets/CONTRIBUTING.md +++ b/kedro-datasets/CONTRIBUTING.md @@ -27,7 +27,7 @@ If you have new ideas for Kedro-Datasets then please open a [GitHub issue](https If you're unsure where to begin contributing to Kedro-Datasets, please start by looking through the `good first issue` and `help wanted` on [GitHub](https://github.com/kedro-org/kedro-plugins/issues). If you want to contribute a new dataset, read the [tutorial to create and contribute a custom dataset](https://docs.kedro.org/en/stable/data/how_to_create_a_custom_dataset.html) in the Kedro documentation. -Make sure to add the new dataset to `kedro_datasets.rst` so that it shows up in the API documentation and to `static/jsonschema/kedro-catalog-X.json` for IDE validation. +Make sure to add the new dataset to `kedro_datasets.rst` so that it shows up in the API documentation and to `kedro-datasets/static/jsonschema/kedro-catalog-X.json` for IDE validation. Below is a guide to help you understand the process of contributing a new dataset, whether it falls under the category of core or experimental datasets. 
From b84d1a104bca49a389cf9bfd2fe99fb3f0aeec7a Mon Sep 17 00:00:00 2001 From: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> Date: Wed, 11 Dec 2024 03:31:19 +0000 Subject: [PATCH 5/6] Remove schemas for CachedDataset, MemoryDataset, and LambdaDataset since they remain in Kedro core Signed-off-by: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> --- .../static/jsonschema/kedro-catalog-0.19.json | 84 +------------------ 1 file changed, 1 insertion(+), 83 deletions(-) diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json index d5b608d50..f19266812 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json @@ -9,10 +9,7 @@ "type": { "type": "string", "enum": [ - "CachedDataset", - "IncrementalDataset", - "MemoryDataset", - "LambdaDataset", + "partitions.IncrementalDataset", "partitions.PartitionedDataset", "api.APIDataset", "biosequence.BioSequenceDataset", @@ -51,30 +48,6 @@ } }, "allOf": [ - { - "if": { - "properties": { - "type": { - "const": "CachedDataset" - } - } - }, - "then": { - "required": [ - "dataset" - ], - "properties": { - "dataset": { - "pattern": ".*", - "description": "A Kedro Dataset object or a dictionary to cache." - }, - "copy_mode": { - "type": "string", - "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." - } - } - } - }, { "if": { "properties": { @@ -127,61 +100,6 @@ } } }, - { - "if": { - "properties": { - "type": { - "const": "MemoryDataset" - } - } - }, - "then": { - "required": [], - "properties": { - "data": { - "pattern": ".*", - "description": "Python object containing the data." - }, - "copy_mode": { - "type": "string", - "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". 
If not\nprovided, it is inferred based on the data type." - } - } - } - }, - { - "if": { - "properties": { - "type": { - "const": "LambdaDataset" - } - } - }, - "then": { - "required": [ - "load", - "save" - ], - "properties": { - "load": { - "pattern": ".*", - "description": "Method to load data from a data set." - }, - "save": { - "pattern": ".*", - "description": "Method to save data to a data set." - }, - "exists": { - "pattern": ".*", - "description": "Method to check whether output data already exists." - }, - "release": { - "pattern": ".*", - "description": "Method to release any cached information." - } - } - } - }, { "if": { "properties": { From 7ebb8faedb7ae19f316f06402298f7ac35fb6219 Mon Sep 17 00:00:00 2001 From: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> Date: Thu, 12 Dec 2024 02:27:08 +0000 Subject: [PATCH 6/6] Update release notes Signed-off-by: Chris Schopp <56572144+chrisschopp@users.noreply.github.com> --- kedro-datasets/RELEASE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index ee05edb82..d7ba58b7e 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -18,6 +18,7 @@ - Implemented Snowflake's [local testing framework](https://docs.snowflake.com/en/developer-guide/snowpark/python/testing-locally) for testing purposes. - Improved the dependency management for Spark-based datasets by refactoring the Spark and Databricks utility functions used across the datasets. - Added deprecation warning for `tracking.MetricsDataset` and `tracking.JSONDataset`. +- Moved `kedro-catalog` JSON schemas from Kedro core to `kedro-datasets`. 
## Breaking Changes @@ -30,6 +31,7 @@ Many thanks to the following Kedroids for contributing PRs to this release: - [Thomas d'Hooghe](https://github.com/tdhooghe) - [Minura Punchihewa](https://github.com/MinuraPunchihewa) - [Mark Druffel](https://github.com/mark-druffel) +- [Chris Schopp](https://github.com/chrisschopp) # Release 5.1.0