Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
ankatiyar authored Sep 24, 2024
2 parents 49d3a88 + 4b75db7 commit 38f3338
Show file tree
Hide file tree
Showing 23 changed files with 507 additions and 176 deletions.
28 changes: 10 additions & 18 deletions .gitpod.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart
image: gitpod/workspace-python-3.10:2023-04-20-16-32-37

image: gitpod/workspace-python-3.11

tasks:
# We want packages installed during the pre-build init steps to go to /workspace
Expand All @@ -12,22 +10,16 @@ tasks:
echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no
init: |
make sign-off
pip install uv
uv venv
echo source .venv/bin/activate >> ~/.bashrc
source ~/.bashrc
make install-test-requirements plugin=kedro-datasets
command: |
pre-commit install --install-hooks
clear
github:
prebuilds:
# enable for the master/default branch (defaults to true)
master: true
# enable for all branches in this repo (defaults to false)
branches: true
# enable for pull requests coming from this repo (defaults to true)
pullRequests: true
# enable for pull requests coming from forks (defaults to false)
pullRequestsFromForks: true
# add a "Review in Gitpod" button as a comment to pull requests (defaults to true)
addComment: false
# add a "Review in Gitpod" button to pull requests (defaults to false)
addBadge: true
- name: system
init: |
sudo apt-get update && sudo apt-get install -y --no-install-recommends libgl1 make
sudo apt-get install -y --no-install-recommends libatk-bridge2.0-0 libcups2 ca-certificates fonts-liberation libasound2 libatk-bridge2.0-0 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 lsb-release wget xdg-utils
68 changes: 26 additions & 42 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,6 @@ package:
rm -Rf dist;\
python -m build

pypi:
python -m pip install twine -U
python -m twine upload $(plugin)/dist/*

install: package
cd $(plugin) && pip install -U dist/*.whl

install-pip-setuptools:
python -m pip install -U pip setuptools wheel

Expand All @@ -25,46 +18,14 @@ mypy:
# Run the pytest suite in the `tests` directory of the plugin selected via the
# `plugin` variable, using 4 pytest worker processes.
# Usage: make test plugin=<plugin-directory>   (e.g. plugin=kedro-datasets)
test:
cd $(plugin) && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile

# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite
dataset-tests: dataset-doctests
cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow
cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov

extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark
extra_pytest_args=
dataset-doctest%:
if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \
echo "make: *** No rule to make target \`${@}\`. Stop."; \
exit 2; \
fi; \
\
# The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples.
cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \
--ignore kedro_datasets/pandas/gbq_dataset.py \
--ignore kedro_datasets/partitions/partitioned_dataset.py \
--ignore kedro_datasets/redis/redis_dataset.py \
--ignore kedro_datasets/snowflake/snowpark_dataset.py \
--ignore kedro_datasets/spark/spark_hive_dataset.py \
--ignore kedro_datasets/spark/spark_jdbc_dataset.py \
$(extra_pytest_arg${*})

# Same suite as `test`, but without the `--numprocesses` / `--dist` options,
# i.e. without splitting the run across worker processes.
# Usage: make test-sequential plugin=<plugin-directory>
test-sequential:
cd $(plugin) && pytest tests --cov-config pyproject.toml

# Run `behave` from inside the plugin directory selected via the `plugin` variable.
# Usage: make e2e-tests plugin=<plugin-directory>
e2e-tests:
cd $(plugin) && behave

secret-scan:
trufflehog --max_depth 1 --exclude_paths trufflehog-ignore.txt .

clean:
cd $(plugin);\
rm -rf build dist pip-wheel-metadata .pytest_cache;\
find . -regex ".*/__pycache__" -exec rm -rf {} +;\
find . -regex ".*\.egg-info" -exec rm -rf {} +;\

install-test-requirements:
cd $(plugin) && pip install ".[test]"
cd $(plugin) && uv pip install ".[test]"

install-pre-commit:
pre-commit install --install-hooks
Expand All @@ -79,12 +40,12 @@ sign-off:
echo '--in-place "$$1"' >> .git/hooks/commit-msg
chmod +x .git/hooks/commit-msg

## kedro-datasets specific

# kedro-datasets related only
# kedro-datasets tests without the Spark-dependent parts: first the
# `dataset-doctests-no-spark` prerequisite, then the pytest suite with
# tests/spark and tests/databricks ignored. Coverage is disabled (--no-cov) and
# the run uses 4 pytest worker processes.
test-no-spark: dataset-doctests-no-spark
cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks --numprocesses 4 --dist loadfile

# Same selection as `test-no-spark` (doctests prerequisite, tests/spark and
# tests/databricks ignored, no coverage), but without the `--numprocesses` /
# `--dist` options.
test-no-spark-sequential: dataset-doctests-no-spark
cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks

# kedro-datasets/snowflake tests skipped from default scope
test-snowflake-only:
Expand All @@ -93,3 +54,26 @@ test-snowflake-only:

# Run Sphinx with the `linkcheck` builder over the kedro-datasets documentation
# (docs/source), writing results to _build/linkcheck and doctrees to _build/doctrees.
check-datasets-docs:
cd kedro-datasets && python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck

# Full kedro-datasets test run. The `dataset-doctests` prerequisite is built first.
# The first command runs the pytest suite with tests/tensorflow ignored; the second
# runs test_tensorflow_model_dataset separately (without coverage), because these
# tests are flaky when run as part of the full test-suite.
dataset-tests: dataset-doctests
cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow
cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov

# Extra pytest arguments for the doctest targets, looked up by target name.
# The recipe below ends with $(extra_pytest_arg${*}), where ${*} is the stem
# matched by `%`:
#   dataset-doctests          -> stem `s`          -> $(extra_pytest_args)           (empty)
#   dataset-doctests-no-spark -> stem `s-no-spark` -> $(extra_pytest_args-no-spark)
# Renaming either variable therefore breaks the lookup.
extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark
extra_pytest_args=
# Pattern rule that serves exactly two targets: `dataset-doctests` and
# `dataset-doctests-no-spark`. Any other stem is rejected by the shell guard,
# which prints a make-style "No rule to make target" message and exits with
# status 2. Otherwise pytest runs the doctests of the `kedro_datasets` package
# (--doctest-modules) without coverage, skipping the modules listed via --ignore.
dataset-doctest%:
if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \
echo "make: *** No rule to make target \`${@}\`. Stop."; \
exit 2; \
fi; \
\
# The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples.
cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \
--ignore kedro_datasets/pandas/gbq_dataset.py \
--ignore kedro_datasets/partitions/partitioned_dataset.py \
--ignore kedro_datasets/redis/redis_dataset.py \
--ignore kedro_datasets/snowflake/snowpark_dataset.py \
--ignore kedro_datasets/spark/spark_hive_dataset.py \
--ignore kedro_datasets/spark/spark_jdbc_dataset.py \
$(extra_pytest_arg${*})
28 changes: 19 additions & 9 deletions kedro-datasets/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ api = ["kedro-datasets[api-apidataset]"]
biosequence-biosequencedataset = ["biopython~=1.73"]
biosequence = ["kedro-datasets[biosequence-biosequencedataset]"]

dask-csvdataset = ["dask[dataframe]>=2021.10"]
dask-parquetdataset = ["dask[complete]>=2021.10", "triad>=0.6.7, <1.0"]
dask = ["kedro-datasets[dask-parquetdataset]"]
dask = ["kedro-datasets[dask-parquetdataset, dask-csvdataset]"]

databricks-managedtabledataset = ["kedro-datasets[spark-base,pandas-base,delta-base,hdfs-base,s3fs-base]"]
databricks = ["kedro-datasets[databricks-managedtabledataset]"]
Expand Down Expand Up @@ -92,7 +93,7 @@ pandas-featherdataset = ["kedro-datasets[pandas-base]"]
pandas-gbqtabledataset = ["kedro-datasets[pandas-base]", "pandas-gbq>=0.12.0"]
pandas-gbqquerydataset = ["kedro-datasets[pandas-base]", "pandas-gbq>=0.12.0"]
pandas-genericdataset = ["kedro-datasets[pandas-base]"]
pandas-hdfdataset = ["kedro-datasets[pandas-base]", "tables~=3.6"]
pandas-hdfdataset = ["kedro-datasets[pandas-base]", "tables>=3.6"]
pandas-jsondataset = ["kedro-datasets[pandas-base]"]
pandas-parquetdataset = ["kedro-datasets[pandas-base]", "pyarrow>=6.0"]
pandas-sqltabledataset = ["kedro-datasets[pandas-base]", "SQLAlchemy>=1.4, <3.0"]
Expand Down Expand Up @@ -127,9 +128,12 @@ plotly = ["kedro-datasets[plotly-htmldataset,plotly-jsondataset,plotly-plotlydat

polars-csvdataset = ["kedro-datasets[polars-base]"]
polars-eagerpolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2"]
polars-genericdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2"]
polars-lazypolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "deltalake >= 0.6.2"]
polars = ["kedro-datasets[polars-genericdataset]"]
polars = [
"""kedro-datasets[polars-csvdataset,\
polars-eagerpolarsdataset,\
polars-lazypolarsdataset]"""
]

redis-pickledataset = ["redis~=4.1"]
redis = ["kedro-datasets[redis-pickledataset]"]
Expand All @@ -140,8 +144,15 @@ snowflake = ["kedro-datasets[snowflake-snowparktabledataset]"]
spark-deltatabledataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base,delta-base]"]
spark-sparkdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"]
spark-sparkhivedataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"]
spark-sparkjdbcdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"]
spark = ["kedro-datasets[spark-deltatabledataset]"]
spark-sparkjdbcdataset = ["kedro-datasets[spark-base]"]
spark-sparkstreamingdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"]
spark = [
"""kedro-datasets[spark-deltatabledataset,\
spark-sparkdataset,\
spark-sparkhivedataset,\
spark-sparkjdbcdataset,\
spark-sparkstreamingdataset]"""
]

svmlight-svmlightdataset = ["scikit-learn>=1.0.2", "scipy~=1.7.3"]
svmlight = ["kedro-datasets[svmlight-svmlightdataset]"]
Expand Down Expand Up @@ -211,7 +222,7 @@ test = [
"ibis-framework[duckdb,examples]",
"import-linter[toml]==1.2.6",
"ipython>=7.31.1, <8.0",
"Jinja2<3.1.0",
"Jinja2<3.2.0",
"joblib>=0.14",
"jupyterlab>=3.0",
"jupyter~=1.0",
Expand Down Expand Up @@ -250,8 +261,7 @@ test = [
"scipy>=1.7.3",
"packaging",
"SQLAlchemy>=1.2",
"tables>=3.8.0; platform_system == 'Windows'", # Import issues with python 3.8 with pytables pinning to 3.8.0 fixes this https://github.com/PyTables/PyTables/issues/933#issuecomment-1555917593
"tables~=3.6; platform_system != 'Windows'",
"tables>=3.6",
"tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'",
"tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'",
"triad>=0.6.7, <1.0",
Expand Down
2 changes: 1 addition & 1 deletion kedro-telemetry/kedro_telemetry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@

import logging

logging.getLogger(__name__).setLevel(logging.INFO)
logging.getLogger(__name__).setLevel(logging.DEBUG)
27 changes: 10 additions & 17 deletions kedro-telemetry/kedro_telemetry/masking.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Module containing command masking functionality."""
from __future__ import annotations

from typing import Any, Iterator
from typing import Any

import click

Expand Down Expand Up @@ -81,16 +81,19 @@ def _get_cli_structure(
return output


def _mask_kedro_cli(
cli_struct: dict[str | None, Any], command_args: list[str]
) -> list[str]:
def _mask_kedro_cli(cli: click.CommandCollection, command_args: list[str]) -> list[str]:
"""Takes a dynamic vocabulary (based on `KedroCLI`) and returns
a masked CLI input"""
output = []

# Preserve the initial part of the command until parameters sections begin
arg_index = 0
current_CLI = cli_struct.get("kedro", {})
cmd = command_args[0] if command_args else ""
if cmd in {"--help", "--version", "-h", "-v", ""}:
return command_args
click_cmd = cli.get_command(ctx=None, cmd_name=cmd) # type: ignore
if click_cmd is None:
return [MASK]

current_CLI = _get_cli_structure(click_cmd)
while (
arg_index < len(command_args)
and not command_args[arg_index].startswith("-")
Expand All @@ -116,13 +119,3 @@ def _mask_kedro_cli(
output.append(MASK)

return output


def _recursive_items(dictionary: dict[Any, Any]) -> Iterator[Any]:
for key, value in dictionary.items():
if isinstance(value, dict):
yield key
yield from _recursive_items(value)
else:
yield key
yield value
29 changes: 14 additions & 15 deletions kedro-telemetry/kedro_telemetry/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from kedro.pipeline import Pipeline

from kedro_telemetry import __version__ as TELEMETRY_VERSION
from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli
from kedro_telemetry.masking import _mask_kedro_cli

HEAP_APPID_PROD = "2388822444"
HEAP_ENDPOINT = "https://heapanalytics.com/api/track"
Expand All @@ -49,6 +49,7 @@
CONFIG_FILENAME = "telemetry.toml"
PYPROJECT_CONFIG_NAME = "pyproject.toml"
UNDEFINED_PACKAGE_NAME = "undefined_package_name"
MISSING_USER_IDENTITY = "missing_user_identity"

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -78,7 +79,7 @@ def _get_or_create_uuid() -> str:
return new_uuid

except Exception as e:
logging.error(f"Failed to retrieve UUID: {e}")
logging.debug(f"Failed to retrieve UUID: {e}")
return ""


Expand All @@ -104,7 +105,7 @@ def _get_or_create_project_id(pyproject_path: Path) -> str | None:
file.write(toml_string)
return project_id
except KeyError:
logging.error(
logging.debug(
f"Failed to retrieve project id or save project id: "
f"{str(pyproject_path)} does not contain a [tool.kedro] section"
)
Expand Down Expand Up @@ -148,7 +149,7 @@ def _generate_new_uuid(full_path: str) -> str:

return new_uuid
except Exception as e:
logging.error(f"Failed to create UUID: {e}")
logging.debug(f"Failed to create UUID: {e}")
return ""


Expand Down Expand Up @@ -176,10 +177,7 @@ def before_command_run(

# get KedroCLI and its structure from actual project root
cli = KedroCLI(project_path=project_path if project_path else Path.cwd())
cli_struct = _get_cli_structure(cli_obj=cli, get_help=False)
masked_command_args = _mask_kedro_cli(
cli_struct=cli_struct, command_args=command_args
)
masked_command_args = _mask_kedro_cli(cli, command_args=command_args)

self._user_uuid = _get_or_create_uuid()

Expand All @@ -200,13 +198,15 @@ def after_command_run(self):

@hook_impl
def after_context_created(self, context):
"""Hook implementation to send project statistics data to Heap"""
"""Hook implementation to read metadata"""

self._consent = _check_for_telemetry_consent(context.project_path)
self._project_path = context.project_path

@hook_impl
def after_catalog_created(self, catalog):
"""Hook implementation to send project statistics data to Heap"""

if self._consent is False:
return

Expand Down Expand Up @@ -241,12 +241,12 @@ def _send_telemetry_heap_event(self, event_name: str):
try:
_send_heap_event(
event_name=event_name,
identity=self._user_uuid,
identity=self._user_uuid if self._user_uuid else MISSING_USER_IDENTITY,
properties=self._event_properties,
)
self._sent = True
except Exception as exc:
logger.warning(
logger.debug(
"Something went wrong in hook implementation to send command run data to Heap. "
"Exception: %s",
exc,
Expand Down Expand Up @@ -324,22 +324,21 @@ def _send_heap_event(
"event": event_name,
"timestamp": datetime.now().strftime(TIMESTAMP_FORMAT),
"properties": properties or {},
"identity": identity,
}
if identity:
data["identity"] = identity

try:
resp = requests.post(
url=HEAP_ENDPOINT, headers=HEAP_HEADERS, data=json.dumps(data), timeout=10
)
if resp.status_code != 200: # noqa: PLR2004
logger.warning(
logger.debug(
"Failed to send data to Heap. Response code returned: %s, Response reason: %s",
resp.status_code,
resp.reason,
)
except requests.exceptions.RequestException as exc:
logger.warning(
logger.debug(
"Failed to send data to Heap. Exception of type '%s' was raised.",
type(exc).__name__,
)
Expand Down
Loading

0 comments on commit 38f3338

Please sign in to comment.