Skip to content

Commit

Permalink
Add an advanced dynamic_study_prefix field to manifests. (#309)
Browse files Browse the repository at this point in the history
This will allow a study to provide a Python file that accepts study
options and generates an appropriate study prefix.

It's largely targeted at the data_metrics study, but might have
application elsewhere.
  • Loading branch information
mikix authored Oct 25, 2024
1 parent fb0823d commit f131f24
Show file tree
Hide file tree
Showing 16 changed files with 321 additions and 66 deletions.
60 changes: 57 additions & 3 deletions MAINTAINER.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Maintainer notes

This document is intended for users contributing/manitaining this repository.
This document is intended for users contributing/maintaining this repository.
It is not comprehensive, but aims to capture items relevant to architecture
that are not covered in another document.

Expand All @@ -10,7 +10,7 @@ Since these terms are used somewhat haphazardly in different database implementa
we'll quickly define them for the purposes of this document:

- database - a single instance of a database product, local or in the cloud. it can
contain serveral schemas.
contain several schemas.
- schema - a namespaced collection of tables inside a database

Cumulus, as a holistic system, is designed to allow querying against the entire history
Expand All @@ -36,4 +36,58 @@ via cross-database joining. Additional tables should not be created by users in
schemas.

A user could elect to use these vocabulary builders and skip the entire rest of the
Cumulus ecosystem, if they wanted to.
Cumulus ecosystem, if they wanted to.

## Advanced study features

These features are for very narrow and advanced use cases,
designed for internal project studies (like `core`, `discovery`, or `data_metrics`).

### Dynamic prefixes

The `data_metrics` study wants to be able to generate an analysis of a single study cohort's data.
In order to do this, it needs to privately namespace that analysis.

The solution we use for this is to allow a study to dynamically generate the prefix it will use.
Thus, the `data_metrics` study can use a prefix like `data_metrics_hypertension__` for a
`hypertension` study and `data_metrics_covid__` for a `covid` study.

#### Config
Add a field called `dynamic_study_prefix` pointing at a local Python file.
If this field is present, any `study_prefix` field is ignored.
```toml
dynamic_study_prefix = "gen_prefix.py"
```

#### Generator file

Your generator file will be called as a Python script,
with the `--option` arguments that Cumulus Library gets
(but without the `--option` bit).
You should ignore unknown options, for forward compatibility.

You should print your desired prefix to stdout.

Example:
```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--study")
args, _rest = parser.parse_known_args()

if args.study:
print(f"data_metrics_{args.study}")
else:
print("data_metrics")
```

#### Usage

Your study still has to be selected using its original name (`--target=data_metrics`),
but the resulting tables will be prefixed using the generated study name.

This command line would build a `data_metrics_hypertension` study:
```sh
cumulus-library build -t data_metrics --option study:hypertension
```
2 changes: 1 addition & 1 deletion cumulus_library/actions/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def _load_and_execute_builder(
:keyword filename: filename of a module implementing a TableBuilder
:keyword db_parser: an object implementing DatabaseParser for the target database
:keyword write_reference_sql: if true, writes sql to disk inside a study's directory
:keyword doc_string: A string to insert between queries written to disk
:keyword doc_str: A string to insert between queries written to disk
"""

# Since we have to support arbitrary user-defined python files here, we
Expand Down
33 changes: 13 additions & 20 deletions cumulus_library/builders/counts.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Class for generating counts tables from templates"""

import sys
from pathlib import Path

from cumulus_library import BaseTableBuilder, errors, study_manifest
Expand All @@ -14,37 +13,28 @@ class CountsBuilder(BaseTableBuilder):
"""Extends BaseTableBuilder for counts-related use cases"""

def __init__(self, study_prefix: str | None = None):
if study_prefix is None:
# This slightly wonky approach will give us the path of the
# directory of a class extending the CountsBuilder
study_path = Path(sys.modules[self.__module__].__file__).parent

try:
parser = study_manifest.StudyManifest(study_path)
self.study_prefix = parser.get_study_prefix()
except Exception as e:
raise errors.CountsBuilderError(
"CountsBuilder must be either initiated with a study prefix, "
"or be in a directory with a valid manifest.toml"
) from e
else:
self.study_prefix = study_prefix
super().__init__()
self.study_prefix = study_prefix

def get_table_name(self, table_name: str, duration=None) -> str:
"""Convenience method for constructing table name
:param table_name: table name to add after the study prefix
:param duration: a time period reflecting the table binning strategy
"""
if not self.study_prefix:
raise errors.CountsBuilderError(
"CountsBuilder must be either initiated with a study prefix, "
"or be in a directory with a valid manifest.toml"
)
if duration:
return f"{self.study_prefix}__{table_name}_{duration}"
else:
return f"{self.study_prefix}__{table_name}"

def get_where_clauses(
self, clause: list | str | None = None, min_subject: int | None = None
) -> str:
) -> list[str]:
"""Convenience method for constructing arbitrary where clauses.
:param clause: either a string or a list of sql where statements
Expand Down Expand Up @@ -295,10 +285,13 @@ def write_counts(self, filepath: str):
self.comment_queries()
self.write_queries(path=Path(filepath))

def prepare_queries(self, cursor: object | None = None, schema: str | None = None):
"""Stub implementing abstract base class
def prepare_queries(
self, *args, manifest: study_manifest.StudyManifest | None = None, **kwargs
):
"""Prepare count queries
This should be overridden in any count generator. See studies/core/count_core.py
for an example
"""
pass # pragma: no cover
if manifest and not self.study_prefix:
self.study_prefix = manifest.get_study_prefix()
63 changes: 44 additions & 19 deletions cumulus_library/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ class StudyRunner:
verbose = False
schema_name = None

def __init__(self, config: base_utils.StudyConfig, data_path: str):
def __init__(self, config: base_utils.StudyConfig, data_path: str | None):
self.config = config
self.data_path = data_path
self.data_path = data_path and pathlib.Path(data_path)

def get_config(self, manifest=study_manifest.StudyManifest):
def get_config(self, manifest: study_manifest.StudyManifest):
schema = base_utils.get_schema(self.config, manifest)
if schema == self.config.schema:
return self.config
Expand All @@ -57,6 +57,7 @@ def clean_study(
targets: list[str],
study_dict: dict,
*,
options: dict[str, str],
prefix: bool = False,
) -> None:
"""Removes study table/views from Athena.
Expand All @@ -65,8 +66,9 @@ def clean_study(
this can be useful for cleaning up tables if a study prefix is changed
for some reason.
:param target: The study prefix to use for IDing tables to remove
:param targets: The study prefixes to use for IDing tables to remove
:param study_dict: The dictionary of available study targets
:param options: The dictionary of study-specific options
:keyword prefix: If True, does a search by string prefix in place of study name
"""
if targets is None:
Expand All @@ -77,26 +79,28 @@ def clean_study(

for target in targets:
if prefix:
manifest = study_manifest.StudyManifest()
manifest = study_manifest.StudyManifest(options=options)
cleaner.clean_study(
config=self.get_config(manifest), manifest=manifest, prefix=target
)
else:
manifest = study_manifest.StudyManifest(study_dict[target])
manifest = study_manifest.StudyManifest(study_dict[target], options=options)
cleaner.clean_study(config=self.get_config(manifest), manifest=manifest)

def clean_and_build_study(
self,
target: pathlib.Path,
*,
options: dict[str, str],
continue_from: str | None = None,
) -> None:
"""Recreates study views/tables
:param target: A path to the study directory
:param options: The dictionary of study-specific options
:keyword continue_from: Restart a run from a specific sql file (for dev only)
"""
manifest = study_manifest.StudyManifest(target, self.data_path)
manifest = study_manifest.StudyManifest(target, self.data_path, options=options)
try:
builder.run_protected_table_builder(config=self.get_config(manifest), manifest=manifest)
if not continue_from:
Expand Down Expand Up @@ -150,28 +154,39 @@ def run_matching_table_builder(
self,
target: pathlib.Path,
table_builder_name: str,
*,
options: dict[str, str],
) -> None:
"""Runs a single table builder
:param target: A path to the study directory
:param table_builder_name: a builder file referenced in the study's manifest
:param options: The dictionary of study-specific options
"""
manifest = study_manifest.StudyManifest(target)
manifest = study_manifest.StudyManifest(target, options=options)
builder.run_matching_table_builder(
config=self.get_config(manifest),
manifest=manifest,
builder=table_builder_name,
)

### Data exporters
def export_study(self, target: pathlib.Path, data_path: pathlib.Path, archive: bool) -> None:
def export_study(
self,
target: pathlib.Path,
data_path: pathlib.Path,
archive: bool,
*,
options: dict[str, str],
) -> None:
"""Exports aggregates defined in a manifest
:param target: A path to the study directory
:param datapath: The location to export data to
:param data_path: The location to export data to
:param archive: If true, will export all tables, otherwise uses manifest list
:param options: The dictionary of study-specific options
"""
manifest = study_manifest.StudyManifest(target, data_path)
manifest = study_manifest.StudyManifest(target, data_path, options=options)
exporter.export_study(
config=self.get_config(manifest),
manifest=manifest,
Expand All @@ -183,14 +198,14 @@ def generate_study_sql(
self,
target: pathlib.Path,
*,
builder: str | None = None,
options: dict[str, str],
) -> None:
"""Materializes study sql from templates
:param target: A path to the study directory
:keyword builder: Specify a single builder to generate sql from
:param options: The dictionary of study-specific options
"""
manifest = study_manifest.StudyManifest(target)
manifest = study_manifest.StudyManifest(target, options=options)
file_generator.run_generate_sql(config=self.get_config(manifest), manifest=manifest)

def generate_study_markdown(
Expand Down Expand Up @@ -252,7 +267,8 @@ def get_studies_by_manifest_path(path: pathlib.Path) -> dict[str, pathlib.Path]:
manifest_paths.update(get_studies_by_manifest_path(child_path))
elif child_path.name == "manifest.toml":
try:
manifest = study_manifest.StudyManifest(path)
# Pass empty options, since we are not in a study-specific context
manifest = study_manifest.StudyManifest(path, options={})
manifest_paths[manifest.get_study_prefix()] = path
except errors.StudyManifestParsingError as exc:
rich.print(f"[bold red] Ignoring study in '{path}': {exc}")
Expand Down Expand Up @@ -309,15 +325,19 @@ def run_cli(args: dict):
targets=args["target"],
study_dict=study_dict,
prefix=args["prefix"],
options=args["options"],
)
elif args["action"] == "build":
for target in args["target"]:
if args["builder"]:
runner.run_matching_table_builder(study_dict[target], args["builder"])
runner.run_matching_table_builder(
study_dict[target], args["builder"], options=args["options"]
)
else:
runner.clean_and_build_study(
study_dict[target],
continue_from=args["continue_from"],
options=args["options"],
)

elif args["action"] == "export":
Expand All @@ -337,7 +357,12 @@ def run_cli(args: dict):
if response.lower() != "y":
sys.exit()
for target in args["target"]:
runner.export_study(study_dict[target], args["data_path"], args["archive"])
runner.export_study(
study_dict[target],
args["data_path"],
args["archive"],
options=args["options"],
)

elif args["action"] == "import":
for archive in args["archive_path"]:
Expand All @@ -346,7 +371,7 @@ def run_cli(args: dict):

elif args["action"] == "generate-sql":
for target in args["target"]:
runner.generate_study_sql(study_dict[target], builder=args["builder"])
runner.generate_study_sql(study_dict[target], options=args["options"])

elif args["action"] == "generate-md":
for target in args["target"]:
Expand Down Expand Up @@ -429,7 +454,7 @@ def main(cli_args=None):
options = {}
cli_options = args.get("options") or []
for c_arg in cli_options:
c_arg = c_arg.split(":", 2)
c_arg = c_arg.split(":", 1)
if len(c_arg) == 1:
sys.exit(
f"Custom argument '{c_arg}' is not validly formatted.\n"
Expand Down
1 change: 1 addition & 0 deletions cumulus_library/studies/core/count_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def count_core_patient(self):
return self.count_patient(table_name, from_table, cols)

def prepare_queries(self, *args, **kwargs):
super().prepare_queries(*args, **kwargs)
self.queries = [
self.count_core_allergyintolerance(duration="month"),
self.count_core_condition(duration="month"),
Expand Down
Loading

0 comments on commit f131f24

Please sign in to comment.