From cd706bbcf2cee807d3f8382aaedbfac1f114a797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Courivaud?= Date: Sat, 11 Jan 2025 12:14:45 +0100 Subject: [PATCH] setup mother duck instead of local duckdb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raphaël Courivaud --- analytics/dagster/logs/event.log | 3 --- .../dagster/src/assets/dwh/setup_duckdb.py | 22 ++++++++++++------- analytics/dagster/src/config.py | 5 +++++ analytics/dagster/src/definitions.py | 6 ++--- analytics/dbt/profiles.yml | 2 +- 5 files changed, 23 insertions(+), 15 deletions(-) delete mode 100644 analytics/dagster/logs/event.log diff --git a/analytics/dagster/logs/event.log b/analytics/dagster/logs/event.log deleted file mode 100644 index c34ebbb6b..000000000 --- a/analytics/dagster/logs/event.log +++ /dev/null @@ -1,3 +0,0 @@ -{"action": "GRAPHQL_QUERY_COMPLETED", "client_time": "2025-01-09 21:12:13.591000", "event_id": "8d7caa20-5400-49a2-9159-c63499697e37", "elapsed_time": "None", "instance_id": "3535e14f-e2a7-4b35-a902-c2e428480e12", "metadata": {"client_id": "05fdde35-9a31-4bf4-a851-823d66198481", "operationName": "PipelineExplorerRootQuery", "elapsedTime": "216"}, "python_version": "3.10.16", "dagster_version": "1.9.6", "os_desc": "Linux-6.10.14-linuxkit-x86_64-with-glibc2.36", "os_platform": "Linux", "run_storage_id": "8cb599b9-bf33-4d0d-932b-b5fa375d93e0", "is_known_ci_env": false} -{"action": "GRAPHQL_QUERY_COMPLETED", "client_time": "2025-01-09 21:13:04.243000", "event_id": "5ffc4cb3-4dab-4ac9-8120-cf172528193c", "elapsed_time": "None", "instance_id": "3535e14f-e2a7-4b35-a902-c2e428480e12", "metadata": {"client_id": "05fdde35-9a31-4bf4-a851-823d66198481", "operationName": "PipelineExplorerRootQuery", "elapsedTime": "89.40000000596046"}, "python_version": "3.10.16", "dagster_version": "1.9.6", "os_desc": "Linux-6.10.14-linuxkit-x86_64-with-glibc2.36", "os_platform": "Linux", "run_storage_id": 
"8cb599b9-bf33-4d0d-932b-b5fa375d93e0", "is_known_ci_env": false} -{"action": "GRAPHQL_QUERY_COMPLETED", "client_time": "2025-01-09 21:13:42.674000", "event_id": "92826caa-9682-4592-aa12-2f299c0378c7", "elapsed_time": "None", "instance_id": "3535e14f-e2a7-4b35-a902-c2e428480e12", "metadata": {"client_id": "05fdde35-9a31-4bf4-a851-823d66198481", "operationName": "PipelineExplorerRootQuery", "elapsedTime": "138.10000000149012"}, "python_version": "3.10.16", "dagster_version": "1.9.6", "os_desc": "Linux-6.10.14-linuxkit-x86_64-with-glibc2.36", "os_platform": "Linux", "run_storage_id": "8cb599b9-bf33-4d0d-932b-b5fa375d93e0", "is_known_ci_env": false} diff --git a/analytics/dagster/src/assets/dwh/setup_duckdb.py b/analytics/dagster/src/assets/dwh/setup_duckdb.py index 698b2fb68..0bd833059 100644 --- a/analytics/dagster/src/assets/dwh/setup_duckdb.py +++ b/analytics/dagster/src/assets/dwh/setup_duckdb.py @@ -1,4 +1,4 @@ -from dagster import asset +from dagster import MaterializeResult, asset from dagster_duckdb import DuckDBResource from ...config import Config @@ -10,12 +10,18 @@ compute_kind="duckdb", ) def setup_duckdb(context, duckdb: DuckDBResource): - SETUP_QUERY = f""" - SET memory_limit = '{Config.DUCKDB_MEMORY_LIMIT}GB'; - SET threads TO {Config.DUCKDB_THREAD_NUMBER}; - """ + context.log.info(f"Config.USE_MOTHER_DUCK={Config.USE_MOTHER_DUCK}") + if not Config.USE_MOTHER_DUCK: + SETUP_QUERY = f""" + SET memory_limit = '{Config.DUCKDB_MEMORY_LIMIT}GB'; + SET threads TO {Config.DUCKDB_THREAD_NUMBER}; + """ - with duckdb.get_connection() as conn: - context.log.info(f"Executing SQL: {SETUP_QUERY}") - conn.execute(SETUP_QUERY) + with duckdb.get_connection() as conn: + context.log.info(f"Executing SQL: {SETUP_QUERY}") + conn.execute(SETUP_QUERY) + + return "DuckDB setup successfully" + else: + return "Mother Duck is used, no need to setup DuckDB" diff --git a/analytics/dagster/src/config.py b/analytics/dagster/src/config.py index 07fc65f97..8a7937093 100644 --- 
a/analytics/dagster/src/config.py +++ b/analytics/dagster/src/config.py @@ -25,6 +25,11 @@ class Config: DUCKDB_MEMORY_LIMIT = os.environ.get("DUCKDB_MEMORY_LIMIT") DUCKDB_THREAD_NUMBER = os.environ.get("DUCKDB_THREAD_NUMBER", 4) METABASE_APP_ID = os.environ.get("METABASE_APP_ID") + + MD_TOKEN = os.environ.get("MD_TOKEN") + USE_MOTHER_DUCK = os.environ.get("USE_MOTHER_DUCK", "true").strip().lower() in ("1", "true", "yes", "on") + USE_MOTHER_DUCK_FOR_METABASE = os.environ.get("USE_MOTHER_DUCK_FOR_METABASE", "false").strip().lower() in ("1", "true", "yes", "on") + diff --git a/analytics/dagster/src/definitions.py b/analytics/dagster/src/definitions.py index f4c0fa78d..108210305 100644 --- a/analytics/dagster/src/definitions.py +++ b/analytics/dagster/src/definitions.py @@ -8,7 +8,7 @@ # from .assets import dagster_production_assets from .assets import dwh - +from .config import Config # from dagster_embedded_elt.dlt import DagsterDltResource from dagster_dbt import DbtCliResource @@ -91,10 +91,10 @@ # "dlt": dlt_resource, "dbt": dbt_resource, "duckdb": DuckDBResource( - database="db/dagster.duckdb", # required + database=f"md:dwh?motherduck_token={Config.MD_TOKEN}" if Config.USE_MOTHER_DUCK else "db/dagster.duckdb", ), "duckdb_metabase": DuckDBResource( - database="db/duckdb_metabase.duckdb", # required + database=f"md:metabase?motherduck_token={Config.MD_TOKEN}" if Config.USE_MOTHER_DUCK_FOR_METABASE else "db/metabase.duckdb", ), }, schedules=[daily_refresh_schedule, yearly_ff_refresh_schedule], diff --git a/analytics/dbt/profiles.yml b/analytics/dbt/profiles.yml index b05d210d3..c3b73e183 100644 --- a/analytics/dbt/profiles.yml +++ b/analytics/dbt/profiles.yml @@ -18,7 +18,7 @@ duckdb_profile: prod: type: duckdb - path: "/opt/dagster/dagster_home/db/dagster.duckdb" + path: "md:dwh?motherduck_token={{env_var('MD_TOKEN')}}" extensions: - httpfs - parquet