Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Converted manifest to single list of files #317

Merged
merged 2 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 60 additions & 87 deletions cumulus_library/actions/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,60 +135,14 @@ def run_protected_table_builder(
)


def run_table_builder(
config: base_utils.StudyConfig,
manifest: study_manifest.StudyManifest,
*,
db_parser: databases.DatabaseParser = None,
) -> None:
"""Loads modules from a manifest and executes code via BaseTableBuilder

:param config: a StudyConfig object
:param manifest: a StudyManifest object
:keyword db_parser: an object implementing DatabaseParser for the target database
"""
for file in manifest.get_table_builder_file_list():
_load_and_execute_builder(
config=config,
manifest=manifest,
filename=file,
db_parser=db_parser,
)


def run_counts_builders(
config: base_utils.StudyConfig,
manifest: study_manifest.StudyManifest,
) -> None:
"""Loads counts modules from a manifest and executes code via BaseTableBuilder

While a count is a form of statistics, it is treated separately from other
statistics because it is, by design, always going to be static against a
given dataset, where other statistical methods may use sampling techniques
or adjustable input parameters that may need to be preserved for later review.

:param config: a StudyConfig object
:param manifest: a StudyManifest object
"""
for file in manifest.get_counts_builder_file_list():
_load_and_execute_builder(
config=config,
manifest=manifest,
filename=file,
)


def run_statistics_builders(
config: base_utils.StudyConfig,
manifest: study_manifest.StudyManifest,
def _run_workflow(
config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
) -> None:
"""Loads statistics modules from toml definitions and executes
"""Loads workflow config from toml definitions and executes workflow

:param config: a StudyConfig object
:param manifest: a StudyManifest object
"""
if len(manifest.get_statistics_file_list()) == 0:
return
existing_stats = []
if not config.stats_build:
existing_stats = (
Expand All @@ -199,40 +153,41 @@ def run_statistics_builders(
)
.fetchall()
)
for file in manifest.get_statistics_file_list():
# This open is a bit redundant with the open inside of the PSM builder,
# but we're letting it slide so that builders function similarly
# across the board
safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
toml_path = pathlib.Path(f"{manifest._study_path}/{file}")
with open(toml_path, "rb") as file:
stats_config = tomllib.load(file)
config_type = stats_config["config_type"]
target_table = stats_config.get("target_table", stats_config.get("table_prefix", ""))

if (target_table,) in existing_stats and not config.stats_build:
continue
if config_type == "psm":
# This open is a bit redundant with the open inside of the PSM builder,
# but we're letting it slide so that builders function similarly
# across the board
safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
toml_path = pathlib.Path(f"{manifest._study_path}/{filename}")
with open(toml_path, "rb") as file:
workflow_config = tomllib.load(file)
config_type = workflow_config["config_type"]
target_table = workflow_config.get("target_table", workflow_config.get("table_prefix", ""))

if (target_table,) in existing_stats and not config.stats_build:
return
match config_type:
case "psm":
builder = psm_builder.PsmBuilder(
toml_config_path=toml_path,
config=stats_config,
config=workflow_config,
data_path=manifest.data_path / f"{manifest.get_study_prefix()}/psm",
)
elif config_type == "valueset":
case "valueset":
builder = valueset_builder.ValuesetBuilder(
toml_config_path=toml_path,
config=stats_config,
config=workflow_config,
data_path=manifest.data_path / f"{manifest.get_study_prefix()}/valueset",
)
else:
case _:
raise errors.StudyManifestParsingError( # pragma: no cover
f"{toml_path} references an invalid statistics type {config_type}."
f"{toml_path} references an invalid workflow type {config_type}."
)
builder.execute_queries(
config=config,
manifest=manifest,
table_suffix=safe_timestamp,
)
builder.execute_queries(
config=config,
manifest=manifest,
table_suffix=safe_timestamp,
)
if config_type in set(item.value for item in enums.StatisticsTypes):
log_utils.log_statistics(
config=config,
manifest=manifest,
Expand All @@ -242,7 +197,7 @@ def run_statistics_builders(
)


def run_matching_table_builder(
def build_matching_files(
config: base_utils.StudyConfig,
manifest: study_manifest.StudyManifest,
*,
Expand All @@ -256,34 +211,52 @@ def run_matching_table_builder(
:keyword builder: filename of a module implementing a TableBuilder
:keyword db_parser: an object implementing DatabaseParser for the target database"""
all_generators = manifest.get_all_generators()
matches = []
for file in all_generators:
if builder and file.find(builder) == -1:
continue
_load_and_execute_builder(
config=config,
manifest=manifest,
filename=file,
db_parser=db_parser,
)
if builder and file.find(builder) != -1:
dogversioning marked this conversation as resolved.
Show resolved Hide resolved
matches.append(file)
build_study(config, manifest, db_parser=db_parser, file_list=matches)


def build_study(
config: base_utils.StudyConfig,
manifest: study_manifest.StudyManifest,
*,
db_parser: databases.DatabaseParser = None,
continue_from: str | None = None,
file_list: list | None = None,
) -> list:
"""Creates tables in the schema by iterating through the sql_config.file_names

:param config: a StudyConfig object
:param manifest: a StudyManifest object
:keyword continue_from: Name of a sql file to resume table creation from
:keyword continue_from: Name of a file to resume table creation from
:returns: loaded queries (for unit testing only)
"""
if file_list is None:
file_list = manifest.get_file_list(continue_from)
for file in file_list:
if file.endswith(".py"):
_load_and_execute_builder(
config=config,
manifest=manifest,
filename=file,
db_parser=db_parser,
)
elif file.endswith(".toml"):
_run_workflow(config=config, manifest=manifest, filename=file)
elif file.endswith(".sql"):
_run_raw_queries(config=config, manifest=manifest, filename=file)
else:
raise errors.StudyManifestParsingError


def _run_raw_queries(
config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
):
queries = []
for file in manifest.get_sql_file_list(continue_from):
for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{file}")):
queries.append([query, file])
for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{filename}")):
queries.append([query, filename])
if len(queries) == 0:
return []
for query in queries:
Expand All @@ -298,7 +271,7 @@ def build_study(
# We want to only show a progress bar if we are :not: printing SQL lines
with base_utils.get_progress_bar(disable=config.verbose) as progress:
task = progress.add_task(
f"Creating {manifest.get_study_prefix()} study in db...",
f"Building tables from {filename}...",
total=len(queries),
visible=not config.verbose,
)
Expand Down
31 changes: 22 additions & 9 deletions cumulus_library/builders/protected_table_builder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""Builder for creating tables for tracking state/logging changes"""

import pathlib
import tomllib

from cumulus_library import (
BaseTableBuilder,
base_utils,
Expand Down Expand Up @@ -64,12 +67,22 @@ def prepare_queries(
TRANSACTION_COLS_TYPES,
)
)
if manifest._study_config.get("statistics_config"):
self.queries.append(
base_templates.get_ctas_empty_query(
db_schema,
statistics,
STATISTICS_COLS,
STATISTICS_COLS_TYPES,
)
)
files = manifest.get_file_list()
files = [file for file in files if file.endswith(".toml")]
dogversioning marked this conversation as resolved.
Show resolved Hide resolved
if len(files) == 0:
return
dogversioning marked this conversation as resolved.
Show resolved Hide resolved
stats_types = set(item.value for item in enums.StatisticsTypes)
for file in files:
toml_path = pathlib.Path(f"{manifest._study_path}/{file}")
with open(toml_path, "rb") as file:
workflow_config = tomllib.load(file)
if workflow_config["config_type"] in stats_types:
self.queries.append(
base_templates.get_ctas_empty_query(
db_schema,
statistics,
STATISTICS_COLS,
STATISTICS_COLS_TYPES,
)
)
return
12 changes: 3 additions & 9 deletions cumulus_library/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def clean_and_build_study(
config=self.get_config(manifest),
manifest=manifest,
)
builder.run_table_builder(config=self.get_config(manifest), manifest=manifest)

else:
log_utils.log_transaction(
Expand All @@ -127,11 +126,6 @@ def clean_and_build_study(
manifest=manifest,
continue_from=continue_from,
)
builder.run_counts_builders(config=self.get_config(manifest), manifest=manifest)
builder.run_statistics_builders(
config=self.get_config(manifest),
manifest=manifest,
)
log_utils.log_transaction(
config=self.get_config(manifest),
manifest=manifest,
Expand All @@ -150,7 +144,7 @@ def clean_and_build_study(
)
raise e

def run_matching_table_builder(
def build_matching_files(
self,
target: pathlib.Path,
table_builder_name: str,
Expand All @@ -164,7 +158,7 @@ def run_matching_table_builder(
:param options: The dictionary of study-specific options
"""
manifest = study_manifest.StudyManifest(target, options=options)
builder.run_matching_table_builder(
builder.build_matching_files(
config=self.get_config(manifest),
manifest=manifest,
builder=table_builder_name,
Expand Down Expand Up @@ -330,7 +324,7 @@ def run_cli(args: dict):
elif args["action"] == "build":
for target in args["target"]:
if args["builder"]:
runner.run_matching_table_builder(
runner.build_matching_files(
study_dict[target], args["builder"], options=args["options"]
)
else:
Expand Down
6 changes: 6 additions & 0 deletions cumulus_library/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ class ProtectedTables(enum.Enum):
TRANSACTIONS = "lib_transactions"


class StatisticsTypes(enum.Enum):
"""A subset of workflows that create statistics sampling artifacts"""

PSM = "psm"


class LogStatuses(enum.Enum):
DEBUG = "debug"
ERROR = "error"
Expand Down
39 changes: 34 additions & 5 deletions cumulus_library/study_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,31 @@ def get_dedicated_schema(self) -> str | None:
options = self._study_config.get("advanced_options", {})
return options.get("dedicated_schema")

def get_file_list(self, continue_from: str | None = None) -> list[str] | None:
"""Reads the contents of the file_config array from the manifest

:returns: An array of files from the manifest, or None if not found.
"""
config = self._study_config.get("file_config", {})
files = config.get("file_names", []) or []
if not files:
files = (
self.get_table_builder_file_list()
+ self.get_sql_file_list()
+ self.get_counts_builder_file_list()
+ self.get_statistics_file_list()
)
if continue_from:
for pos, file in enumerate(files):
if continue_from.split(".", 1)[0] == file.split(".", 1)[0]:
files = files[pos:]
break
else:
raise errors.StudyManifestParsingError(f"No files matching '{continue_from}' found")
return files

# The following four methods are deprecated; get_file_list() only falls back to them
# when the manifest has no file_config list. Remove them once studies are migrated.
def get_sql_file_list(self, continue_from: str | None = None) -> list[str] | None:
"""Reads the contents of the sql_config array from the manifest

Expand Down Expand Up @@ -134,6 +159,8 @@ def get_statistics_file_list(self) -> list[str] | None:
stats_config = self._study_config.get("statistics_config", {})
return stats_config.get("file_names", [])

# End of deprecated section

def get_export_table_list(self) -> list[ManifestExport] | None:
"""Reads the contents of the export_list array from the manifest

Expand Down Expand Up @@ -179,11 +206,13 @@ def get_export_table_list(self) -> list[ManifestExport] | None:

def get_all_generators(self) -> list[str]:
"""Convenience method for getting files that generate sql queries"""
return (
self.get_table_builder_file_list()
+ self.get_counts_builder_file_list()
+ self.get_statistics_file_list()
)
files = self.get_file_list()
return [file for file in files if file.endswith(".py")]
dogversioning marked this conversation as resolved.
Show resolved Hide resolved

def get_all_workflows(self) -> list[str]:
"""Convenience method for getting config files"""
files = self.get_file_list()
return [file for file in files if file.endswith(".toml")]

def get_prefix_with_seperator(self) -> str:
"""Convenience method for getting the appropriate prefix for tables"""
Expand Down
Loading
Loading