From 33498d88f0145a7f06d85897919935eddbd3fbc4 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 19 Dec 2024 15:44:16 +0100
Subject: [PATCH 01/31] draft script

---
 scripts/copy_specified_udf.py | 88 +++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 scripts/copy_specified_udf.py

diff --git a/scripts/copy_specified_udf.py b/scripts/copy_specified_udf.py
new file mode 100644
index 00000000..1069c992
--- /dev/null
+++ b/scripts/copy_specified_udf.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+import logging
+from argparse import ArgumentParser
+from datetime import datetime as dt
+
+from genologics.config import BASEURI, PASSWORD, USERNAME
+from genologics.entities import Process
+from genologics.lims import Lims
+
+from scilifelab_epps.utils import udf_tools
+from scilifelab_epps.wrapper import epp_decorator
+
+TIMESTAMP = dt.now().strftime("%y%m%d_%H%M%S")
+
+
+@epp_decorator(script_path=__file__, timestamp=TIMESTAMP)
+def main():
+    """This script will get the name of an artifact UDF from a master step field,
+    and for every sample artifact in the current step:
+
+    - Use API calls to recursively back-trace the sample history using
+      input-output links until it finds an artifact with the specified UDF
+    - Copy the value of the specified UDF from the found artifact to the
+      artifact of the current step
+
+    """
+    lims = Lims(BASEURI, USERNAME, PASSWORD)
+    process = Process(lims, id=args.pid)
+
+    target_udf = process.udf("step_udf")
+
+    no_outputs = udf_tools.no_outputs(process)
+
+    if no_outputs:
+        logging.info("Step has no output artifacts. Assigning to input artifact.")
+
+    art_tuples = udf_tools.get_art_tuples(process)
+    for art_tuple in art_tuples:
+        target_artifact = art_tuple[0]["uri"] if no_outputs else art_tuple[1]["uri"]
+        logging.info(
+            f"Looking for last recorded UDF '{target_udf}' of sample '{target_artifact.name}'..."
+        )
+        udf_value, udf_history = udf_tools.fetch_last(
+            currentStep=process,
+            art_tuple=art_tuple,
+            target_udfs=target_udf,
+            use_current=False,
+            print_history=True,
+            on_fail=None,
+        )
+        if udf_value:
+            logging.info(f"Traceback:\n{udf_history}")
+            target_artifact.udf[target_udf] = udf_value
+            target_artifact.put()
+            logging.info(
+                f"Updated UDF '{target_udf}' for '{art_tuple[1]['uri'].name}' to '{udf_value}'"
+            )
+        else:
+            logging.warning(
+                f"Could not traceback UDF '{target_udf}' for '{art_tuple[1]['uri'].name}'"
+            )
+            logging.info(f"Traceback:\n{udf_history}")
+
+
+if __name__ == "__main__":
+    # Parse args
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--pid",
+        required=True,
+        type=str,
+        help="Lims ID for current Process.",
+    )
+    parser.add_argument(
+        "--log",
+        required=True,
+        type=str,
+        help="Which file slot to use for the script log.",
+    )
+    parser.add_argument(
+        "--step_udf",
+        required=True,
+        type=str,
+        help="The name of the step UDF listing the target artifact UDF.",
+    )
+    args = parser.parse_args()
+
+    main(args)

From 0cf2c0a705680a4ff8602704aaed41f0babaa7dc Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 19 Dec 2024 18:24:09 +0100
Subject: [PATCH 02/31] bugfix and none-case

---
 scripts/copy_specified_udf.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/copy_specified_udf.py b/scripts/copy_specified_udf.py
index 1069c992..6f9350d8 100644
--- a/scripts/copy_specified_udf.py
+++ b/scripts/copy_specified_udf.py
@@ -14,7 +14,7 @@
 
 
 @epp_decorator(script_path=__file__, timestamp=TIMESTAMP)
-def main():
+def main(args):
     """This script will get the name of an artifact UDF from a master step field,
     and for every sample artifact in the current step:
 
@@ -27,7 +27,9 @@ def main():
     lims = Lims(BASEURI, USERNAME, PASSWORD)
     process = Process(lims, id=args.pid)
 
-    target_udf = process.udf("step_udf")
+    target_udf = process.udf.get(args.step_udf, None)
+    if target_udf is None or target_udf == "None":
+        logging.error(f"No target UDF supplied from step field '{args.step_udf}'")
 
     no_outputs = udf_tools.no_outputs(process)
 

From 3c88aac4fe9b3b92d8eeed18f0e0c3933b7f11bd Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 19 Dec 2024 18:45:59 +0100
Subject: [PATCH 03/31] wip, add TODOs

---
 scripts/copy_specified_udf.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/copy_specified_udf.py b/scripts/copy_specified_udf.py
index 6f9350d8..fd20a021 100644
--- a/scripts/copy_specified_udf.py
+++ b/scripts/copy_specified_udf.py
@@ -36,7 +36,9 @@ def main(args):
 
     if no_outputs:
         logging.info("Step has no output artifacts. Assigning to input artifact.")
 
-    art_tuples = udf_tools.get_art_tuples(process)
+    # TODO need to tweak this script and possible the traceback function to handle both
+    # TODO aggregate QC and regular steps
+    art_tuples = udf_tools.get_art_tuples(process)  # TODO this returns []
     for art_tuple in art_tuples:
         target_artifact = art_tuple[0]["uri"] if no_outputs else art_tuple[1]["uri"]
         logging.info(

From f3381bbd659959c6d3550bffa72add0c8b29b649 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Fri, 20 Dec 2024 12:25:30 +0100
Subject: [PATCH 04/31] rename script file and bump docstring

---
 scripts/{copy_specified_udf.py => fetch_last_known_field.py} | 3 +++
 1 file changed, 3 insertions(+)
 rename scripts/{copy_specified_udf.py => fetch_last_known_field.py} (93%)

diff --git a/scripts/copy_specified_udf.py b/scripts/fetch_last_known_field.py
similarity index 93%
rename from scripts/copy_specified_udf.py
rename to scripts/fetch_last_known_field.py
index fd20a021..fc59ab49 100644
--- a/scripts/copy_specified_udf.py
+++ b/scripts/fetch_last_known_field.py
@@ -23,6 +23,9 @@ def main(args):
     - Copy the value of the specified UDF from the found artifact to the
       artifact of the current step
 
+    Example use-case:
+    - For Nanopore libraries in the Aggregate QC step of the Library Validation protocol,
+      fetch the last recorded artifact UDF "Size (bp)" from the library prep for all samples.
     """
     lims = Lims(BASEURI, USERNAME, PASSWORD)
     process = Process(lims, id=args.pid)

From ff412b0b85052b98b7149861688e5891d62d10e8 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Fri, 20 Dec 2024 12:57:58 +0100
Subject: [PATCH 05/31] live testing dev

---
 scilifelab_epps/utils/udf_tools.py | 43 ++++++++++++++++++++++--------
 scripts/fetch_last_known_field.py  | 43 +++++++++++++++++++++---------
 2 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py
index 278b196d..22ff69e4 100644
--- a/scilifelab_epps/utils/udf_tools.py
+++ b/scilifelab_epps/utils/udf_tools.py
@@ -136,19 +136,36 @@ def fetch_last(
     currentStep: Process,
-    art_tuple: tuple,
     target_udfs: str | list,
+    art_tuple: tuple = None,
+    art: Artifact = None,
     use_current=True,
     print_history=False,
     on_fail=AssertionError,
 ):
     """Recursively look for target UDF.
 
-    Target UDF can be supplied as a string, or as a prioritized list of strings.
+    Arguments:
+
+    - "art_tuple": step input-output tuple. Mutually exclusive use with "art".
+
+    - "art": step artifact, either input or output. Mutually exclusive use with "art_tuple".
+
+    - "target_udfs": can be supplied as a string, or as a
+      prioritized list of strings.
 
-    If "print_history" == True, will return both the target metric and the lookup history as a string.
+    - "use_current": if true, will return the target metric
+      if found in the current step.
+
+    - "print_history": if true, will return both the target
+      metric and the lookup history as a string.
     """
+    assert art_tuple or art, "One of function args 'art_tuple' and 'art' are required."
+    assert not (
+        art_tuple and art
+    ), "Function args 'art_tuple' and 'art' are mutually exclusive."
+ # Convert to list, to enable iteration if isinstance(target_udfs, str): target_udfs = [target_udfs] @@ -158,15 +175,19 @@ def fetch_last( while True: history.append({"Step name": currentStep.type.name, "Step ID": currentStep.id}) - # Try to grab input and output articles, if possible - try: - input_art = art_tuple[0]["uri"] - except: - input_art = None - try: - output_art = art_tuple[1]["uri"] - except: + if len(history) == 1 and not art_tuple: + # Handle the case of having an art instead of an art_tuple in the original step + input_art = art output_art = None + else: + try: + input_art = art_tuple[0]["uri"] + except: + input_art = None + try: + output_art = art_tuple[1]["uri"] + except: + output_art = None if len(history) == 1 and use_current is not True: # If we are in the original step and "use_current" is false, skip diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index fc59ab49..dae82767 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -4,7 +4,7 @@ from datetime import datetime as dt from genologics.config import BASEURI, PASSWORD, USERNAME -from genologics.entities import Process +from genologics.entities import Artifact, Process from genologics.lims import Lims from scilifelab_epps.utils import udf_tools @@ -30,41 +30,58 @@ def main(args): lims = Lims(BASEURI, USERNAME, PASSWORD) process = Process(lims, id=args.pid) + # Get the target UDF from the step field target_udf = process.udf.get(args.step_udf, None) - if target_udf is None or target_udf == "None": - logging.error(f"No target UDF supplied from step field '{args.step_udf}'") + assert ( + target_udf is not None or target_udf != "None" + ), f"No target UDF supplied from step field '{args.step_udf}'" - no_outputs = udf_tools.no_outputs(process) + # Check whether process has output artifacts, not the case for e.g. QC steps + no_outputs: bool = udf_tools.no_outputs(process) + # Load input artifacts + arts_in: list[Artifact] = [ + art for art in process.all_inputs() if art.type == "Analyte" + ] + + # Find target output artifacts, if any if no_outputs: logging.info("Step has no output artifacts. Assigning to input artifact.") + else: + art_tuples: list[tuple[dict]] = process.input_output_maps + art_in2out: dict[Process.Artifact : Process.Artifact] = { + i["uri"]: o["uri"] + for i, o in art_tuples + if i["uri"].type == "Analyte" and o["uri"].type == "Analyte" + } - # TODO need to tweak this script and possible the traceback function to handle both - # TODO aggregate QC and regular steps - art_tuples = udf_tools.get_art_tuples(process) # TODO this returns [] - for art_tuple in art_tuples: - target_artifact = art_tuple[0]["uri"] if no_outputs else art_tuple[1]["uri"] + for art_in in arts_in: + if no_outputs: + target_artifact = art_in + else: + target_artifact = art_in2out[art_in] logging.info( - f"Looking for last recorded UDF '{target_udf}' of sample '{target_artifact.name}'..." + f"Looking for last recorded UDF '{target_udf}' of {'input' if no_outputs else 'output'} artifact '{target_artifact.name}'..." 
) udf_value, udf_history = udf_tools.fetch_last( currentStep=process, - art_tuple=art_tuple, + art=art_in, target_udfs=target_udf, use_current=False, print_history=True, on_fail=None, ) if udf_value: + logging.info(f"Found target UDF '{target_udf}' with value '{udf_value}'") logging.info(f"Traceback:\n{udf_history}") target_artifact.udf[target_udf] = udf_value target_artifact.put() logging.info( - f"Updated UDF '{target_udf}' for '{art_tuple[1]['uri'].name}' to '{udf_value}'" + f"Updated UDF '{target_udf}' for '{art_in.name}' to '{udf_value}'" ) else: logging.warning( - f"Could not traceback UDF '{target_udf}' for '{art_tuple[1]['uri'].name}'" + f"Could not traceback UDF '{target_udf}' for '{art_in.name}'" ) logging.info(f"Traceback:\n{udf_history}") From 89c7a80d9349a44b248744f4815b12cf537ee812 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 20 Dec 2024 13:00:53 +0100 Subject: [PATCH 06/31] fix --- scripts/fetch_last_known_field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index dae82767..a0277ae5 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -33,7 +33,7 @@ def main(args): # Get the target UDF from the step field target_udf = process.udf.get(args.step_udf, None) assert ( - target_udf is not None or target_udf != "None" + target_udf is not None and target_udf != "None" ), f"No target UDF supplied from step field '{args.step_udf}'" # Check whether process has output artifacts, not the case for e.g. QC steps From a7c65e6484517cece542596539e92cbb491c1c03 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 20 Dec 2024 13:23:38 +0100 Subject: [PATCH 07/31] var ref fixes --- scripts/fetch_last_known_field.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index a0277ae5..13dc47aa 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -30,7 +30,7 @@ def main(args): lims = Lims(BASEURI, USERNAME, PASSWORD) process = Process(lims, id=args.pid) - # Get the target UDF from the step field + # Get the name of the target UDF from the step field target_udf = process.udf.get(args.step_udf, None) assert ( target_udf is not None and target_udf != "None" @@ -49,8 +49,8 @@ def main(args): logging.info("Step has no output artifacts. Assigning to input artifact.") else: art_tuples: list[tuple[dict]] = process.input_output_maps - art_in2out: dict[Process.Artifact : Process.Artifact] = { - i["uri"]: o["uri"] + art_in2out: dict[str:Artifact] = { + i["uri"].id: o["uri"] for i, o in art_tuples if i["uri"].type == "Analyte" and o["uri"].type == "Analyte" } @@ -59,13 +59,13 @@ def main(args): if no_outputs: target_artifact = art_in else: - target_artifact = art_in2out[art_in] + target_artifact = art_in2out[art_in.id] logging.info( f"Looking for last recorded UDF '{target_udf}' of {'input' if no_outputs else 'output'} artifact '{target_artifact.name}'..." 
) udf_value, udf_history = udf_tools.fetch_last( currentStep=process, - art=art_in, + art=target_artifact, target_udfs=target_udf, use_current=False, print_history=True, @@ -77,11 +77,11 @@ def main(args): target_artifact.udf[target_udf] = udf_value target_artifact.put() logging.info( - f"Updated UDF '{target_udf}' for '{art_in.name}' to '{udf_value}'" + f"Updated UDF '{target_udf}' for {'input' if no_outputs else 'output'} '{target_artifact.name}' to '{udf_value}'" ) else: logging.warning( - f"Could not traceback UDF '{target_udf}' for '{art_in.name}'" + f"Could not traceback UDF '{target_udf}' for {'input' if no_outputs else 'output'} artifact '{target_artifact.name}'" ) logging.info(f"Traceback:\n{udf_history}") From 6e418246e2aee38acdef1a915b61410bddfb4200 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 20 Dec 2024 13:27:18 +0100 Subject: [PATCH 08/31] bump logs --- scripts/fetch_last_known_field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index 13dc47aa..4f7b7c4e 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -77,7 +77,7 @@ def main(args): target_artifact.udf[target_udf] = udf_value target_artifact.put() logging.info( - f"Updated UDF '{target_udf}' for {'input' if no_outputs else 'output'} '{target_artifact.name}' to '{udf_value}'" + f"Updated UDF '{target_udf}' for {'input' if no_outputs else 'output'} artifact '{target_artifact.name}' to '{udf_value}'" ) else: logging.warning( From 3434eed6f72721295d09800178285c98edeb4279 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 20 Dec 2024 13:30:48 +0100 Subject: [PATCH 09/31] bump vlog --- VERSIONLOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index da67e885..1834d8fa 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # Scilifelab_epps Version Log +## 20241220.1 + +Introduce EPP to fetch last recorded derived sample UDF. + ## 20241211.1 No longer reserve PromethION column 3 for Clinical Genomics. From d6ade1ba10bc3ae26f29ceb18c844c1f5c991181 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 20 Dec 2024 13:40:01 +0100 Subject: [PATCH 10/31] shut up mypy --- scilifelab_epps/utils/udf_tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index 22ff69e4..444d7690 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -137,8 +137,8 @@ def list_udfs(art: Artifact) -> list: def fetch_last( currentStep: Process, target_udfs: str | list, - art_tuple: tuple = None, - art: Artifact = None, + art_tuple=None, + art=None, use_current=True, print_history=False, on_fail=AssertionError, @@ -147,9 +147,9 @@ def fetch_last( Arguments: - - "art_tuple": step input-output tuple. Mutually exclusive use with "art". + - "art_tuple": step input-output tuple or none. Mutually exclusive use with "art". - - "art": step artifact, either input or output. Mutually exclusive use with "art_tuple". + - "art": step artifact, either input or output or none. Mutually exclusive use with "art_tuple". - "target_udfs": can be supplied as a string, or as a prioritized list of strings. 
From 901ef861db7f781b00b4d397f7d12267ef48ceba Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 7 Jan 2025 12:51:07 +0100 Subject: [PATCH 11/31] wip --- scilifelab_epps/utils/udf_tools.py | 62 +++++++++++++++++------------- scripts/fetch_last_known_field.py | 3 ++ 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index 444d7690..658ff640 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -172,10 +172,14 @@ def fetch_last( history = [] + # Track iterations + n = 1 + + # Start traceback while True: history.append({"Step name": currentStep.type.name, "Step ID": currentStep.id}) - if len(history) == 1 and not art_tuple: + if n == 1 and not art_tuple: # Handle the case of having an art instead of an art_tuple in the original step input_art = art output_art = None @@ -189,20 +193,20 @@ def fetch_last( except: output_art = None - if len(history) == 1 and use_current is not True: - # If we are in the original step and "use_current" is false, skip - pass - else: - # Look trough outputs - if output_art: - history[-1].update( - { - "Derived sample ID": output_art.id, - "Derived sample UDFs": dict(output_art.udf.items()), - } - ) + # Look trough outputs + if output_art: + history[-1].update( + { + "Derived sample ID": output_art.id, + "Derived sample UDFs": dict(output_art.udf.items()), + } + ) - for target_udf in target_udfs: + for target_udf in target_udfs: + # Don't search outputs of first and second iteration if use_current is False + if n in [1, 2] and use_current is False: + pass + else: if target_udf in list_udfs(output_art): if print_history is True: return output_art.udf[target_udf], json.dumps( @@ -211,22 +215,26 @@ def fetch_last( else: return output_art.udf[target_udf] - # Look through inputs - if input_art: - if input_art.parent_process: - history[-1].update( - { - "Input sample parent step name": input_art.parent_process.type.name, - "Input sample parent step ID": input_art.parent_process.id, - } - ) + # Look through inputs + if input_art: + if input_art.parent_process: history[-1].update( { - "Input sample ID": input_art.id, - "Input sample UDFs": dict(input_art.udf.items()), + "Input sample parent step name": input_art.parent_process.type.name, + "Input sample parent step ID": input_art.parent_process.id, } ) - for target_udf in target_udfs: + history[-1].update( + { + "Input sample ID": input_art.id, + "Input sample UDFs": dict(input_art.udf.items()), + } + ) + for target_udf in target_udfs: + # Don't search inputs of first iteration if use_current is False + if n == 1 and use_current is False: + pass + else: if target_udf in list_udfs(input_art): if print_history is True: return input_art.udf[target_udf], json.dumps( @@ -265,6 +273,8 @@ def fetch_last( currentStep = pp art_tuple = matching_tuples[0] + n += 1 + except AssertionError: if isinstance(on_fail, type) and issubclass(on_fail, Exception): if print_history is True: diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index 4f7b7c4e..91fd6f78 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -44,6 +44,9 @@ def main(args): art for art in process.all_inputs() if art.type == "Analyte" ] + # TODO currently even steps with valid tuples will only use input artifacts + # No traceback provided for output artifact of current step + # Find target output artifacts, if any if no_outputs: logging.info("Step has no output artifacts. 
Assigning to input artifact.") From f07296b5226ca39e780e1476ec7ad9ed997bca29 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 7 Jan 2025 16:15:24 +0100 Subject: [PATCH 12/31] big swap, need testing --- .../calc_from_args/udf_arg_methods.py | 14 +- scilifelab_epps/utils/udf_tools.py | 234 +++++++----------- scilifelab_epps/zika/utils.py | 4 +- scripts/fetch_last_known_field.py | 13 +- scripts/log_udfs.py | 2 +- scripts/ont_calc_volumes.py | 6 +- scripts/ont_update_amount.py | 6 +- 7 files changed, 111 insertions(+), 168 deletions(-) diff --git a/scilifelab_epps/calc_from_args/udf_arg_methods.py b/scilifelab_epps/calc_from_args/udf_arg_methods.py index 1040c91d..b7a5defd 100644 --- a/scilifelab_epps/calc_from_args/udf_arg_methods.py +++ b/scilifelab_epps/calc_from_args/udf_arg_methods.py @@ -47,19 +47,13 @@ def fetch_from_arg( value = process.udf[arg_dict["udf"]] else: if arg_dict["recursive"]: - # Fetch UDF recursively, back-tracking the input-output tuple - if arg_dict["source"] == "input": - use_current = False - else: - assert arg_dict["source"] == "output" - use_current = True + # Fetch UDF recursively value, history = udf_tools.fetch_last( - currentStep=process, - art_tuple=art_tuple, + target_art=source, target_udfs=arg_dict["udf"], - use_current=use_current, - print_history=True, + log_traceback=True, + return_traceback=True, ) else: # Fetch UDF from input or output artifact diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index 658ff640..eb5fc2e9 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -1,4 +1,5 @@ import json +import logging from typing import Union from genologics.entities import Artifact, Process @@ -39,22 +40,6 @@ def is_filled(art: Artifact, target_udf: str) -> bool: return False -def no_outputs(currentStep: Process) -> bool: - """Check whether step has outputs or not""" - - art_tuples = get_art_tuples(currentStep) - - if art_tuples: - none_outputs = [t[1] is None for t in art_tuples] - - if all(none_outputs): - return True - else: - return False - else: - return True - - def get_art_tuples(currentStep: Process) -> list: """Return I/O tuples whose elements are either 1) both analytes @@ -135,156 +120,119 @@ def list_udfs(art: Artifact) -> list: def fetch_last( - currentStep: Process, + target_art: Artifact, target_udfs: str | list, - art_tuple=None, - art=None, - use_current=True, - print_history=False, - on_fail=AssertionError, -): + log_traceback=False, + return_traceback=False, + on_fail=None, +) -> (str | int | float) | tuple[str | int | float, dict]: """Recursively look for target UDF. Arguments: - - "art_tuple": step input-output tuple or none. Mutually exclusive use with "art". + target_art Artifact to traceback and assign UDF value to. - - "art": step artifact, either input or output or none. Mutually exclusive use with "art_tuple". + target_udfs Can be supplied as a string, or as a prioritized + list of strings. - - "target_udfs": can be supplied as a string, or as a - prioritized list of strings. + log_traceback If True, will log the full traceback. - - "use_current": if true, will return the target metric - if found in the current step. + return_traceback If True, will return the traceback too. - - "print_history": if true, will return both the target - metric and the lookup history as a string. + on_fail If not None, will return this value on failure. """ - assert art_tuple or art, "One of function args 'art_tuple' and 'art' are required." 
- assert not ( - art_tuple and art - ), "Function args 'art_tuple' and 'art' are mutually exclusive." - # Convert to list, to enable iteration if isinstance(target_udfs, str): target_udfs = [target_udfs] - history = [] + # Instantiate traceback + traceback = [] + steps_visited = [] - # Track iterations - n = 1 + try: + # First iteration, current artifact is the target artifact. Don't pull any UDF values. + current_art = target_art + pp = current_art.parent_process + assert pp, f"Artifact '{current_art.name}' ({current_art.id}) has no parent process linked." + steps_visited.append(f"'{pp.type.name}' ({pp.id})") + + traceback.append( + { + "Artifact": { + "Name": current_art.name, + "ID": current_art.id, + "UDFs": dict(current_art.udf.items()), + "Parent Step": { + "Name": pp.type.name if pp else None, + "ID": pp.id if pp else None, + }, + } + } + ) - # Start traceback - while True: - history.append({"Step name": currentStep.type.name, "Step ID": currentStep.id}) + # Start recursive search + while True: + pp_art_tuples = get_art_tuples(pp) + + # If parent process has valid input-output tuples, use for linkage + if pp_art_tuples != []: + for pp_tuple in pp_art_tuples: + if pp_tuple[1]["uri"].id == current_art.id: + current_art = pp_tuple[0]["uri"] + break + # If not, TODO + else: + raise NotImplementedError() - if n == 1 and not art_tuple: - # Handle the case of having an art instead of an art_tuple in the original step - input_art = art - output_art = None - else: - try: - input_art = art_tuple[0]["uri"] - except: - input_art = None - try: - output_art = art_tuple[1]["uri"] - except: - output_art = None + pp = current_art.parent_process + if pp is not None: + steps_visited.append(f"'{pp.type.name}' ({pp.id})") - # Look trough outputs - if output_art: - history[-1].update( + traceback.append( { - "Derived sample ID": output_art.id, - "Derived sample UDFs": dict(output_art.udf.items()), + "Artifact": { + "Name": current_art.name, + "ID": current_art.id, + "UDFs": dict(current_art.udf.items()), + "Parent Step": { + "Name": pp.type.name if pp else None, + "ID": pp.id if pp else None, + }, + } } ) + # Search for correct UDF for target_udf in target_udfs: - # Don't search outputs of first and second iteration if use_current is False - if n in [1, 2] and use_current is False: - pass - else: - if target_udf in list_udfs(output_art): - if print_history is True: - return output_art.udf[target_udf], json.dumps( - history, indent=2 - ) - else: - return output_art.udf[target_udf] - - # Look through inputs - if input_art: - if input_art.parent_process: - history[-1].update( - { - "Input sample parent step name": input_art.parent_process.type.name, - "Input sample parent step ID": input_art.parent_process.id, - } + if target_udf in list_udfs(current_art): + if log_traceback is True: + logging.info(f"Traceback:\n{json.dumps(traceback, indent=2)}") + logging.info( + f"Found target UDF '{target_udf}'" + + f" with value '{current_art.udf[target_udf]}'" + + f" in process {steps_visited[-1]}" + + f" {'output' if pp else 'input'}" + + f" artifact '{current_art.name}' ({current_art.id})" + ) + + if return_traceback: + return current_art.udf[target_udf], traceback + else: + return current_art.udf[target_udf] + + if pp is None: + raise AssertionError( + f"Artifact '{current_art.name}' ({current_art.id}) has no parent process linked and can't be traced back further." 
) - history[-1].update( - { - "Input sample ID": input_art.id, - "Input sample UDFs": dict(input_art.udf.items()), - } + + except AssertionError: + if on_fail is not None: + logging.warning( + f"Failed traceback for artifact '{target_art.name}' ({target_art.id}), falling back to on_fail value '{on_fail}'" + ) + return on_fail + else: + raise AssertionError( + f"Could not find matching UDF(s) [{', '.join(target_udfs)}] for artifact {target_art}" ) - for target_udf in target_udfs: - # Don't search inputs of first iteration if use_current is False - if n == 1 and use_current is False: - pass - else: - if target_udf in list_udfs(input_art): - if print_history is True: - return input_art.udf[target_udf], json.dumps( - history, indent=2 - ) - else: - return input_art.udf[target_udf] - - # Cycle to previous step, if possible - try: - pp = input_art.parent_process - assert pp is not None - - pp_tuples = get_art_tuples(pp) - matching_tuples = [] - for pp_tuple in pp_tuples: - try: - pp_input = pp_tuple[0]["uri"] - except: - pp_input = None - try: - pp_output = pp_tuple[1]["uri"] - except: - pp_output = None - - if (pp_input and pp_input.id == input_art.id) or ( - pp_output and pp_output.id == input_art.id - ): - matching_tuples.append(pp_tuple) - - assert ( - len(matching_tuples) == 1 - ), "Target artifact matches multiple inputs/outputs in previous step." - - # Back-tracking successful, re-assign variables to represent previous step - currentStep = pp - art_tuple = matching_tuples[0] - - n += 1 - - except AssertionError: - if isinstance(on_fail, type) and issubclass(on_fail, Exception): - if print_history is True: - print(json.dumps(history, indent=2)) - raise on_fail( - f"Could not find matching UDF(s) [{', '.join(target_udfs)}] for artifact tuple {art_tuple}" - ) - else: - if print_history is True: - print(json.dumps(history, indent=2)) - return on_fail, json.dumps(history, indent=2) - else: - return on_fail diff --git a/scilifelab_epps/zika/utils.py b/scilifelab_epps/zika/utils.py index 26b0b528..89055188 100644 --- a/scilifelab_epps/zika/utils.py +++ b/scilifelab_epps/zika/utils.py @@ -114,7 +114,9 @@ def fetch_sample_data(currentStep: Process, to_fetch: dict) -> pd.DataFrame: except KeyError: row[col_name] = None else: - row[col_name] = fetch_last(currentStep, art_tuple, udf_query) + row[col_name] = fetch_last( + target_art=art_tuple[1]["uri"], target_udfs=udf_query + ) rows.append(row) # Transform to dataframe diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index 91fd6f78..b15056b1 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -44,9 +44,6 @@ def main(args): art for art in process.all_inputs() if art.type == "Analyte" ] - # TODO currently even steps with valid tuples will only use input artifacts - # No traceback provided for output artifact of current step - # Find target output artifacts, if any if no_outputs: logging.info("Step has no output artifacts. Assigning to input artifact.") @@ -67,16 +64,14 @@ def main(args): f"Looking for last recorded UDF '{target_udf}' of {'input' if no_outputs else 'output'} artifact '{target_artifact.name}'..." 
) udf_value, udf_history = udf_tools.fetch_last( - currentStep=process, - art=target_artifact, + target_art=target_artifact, target_udfs=target_udf, - use_current=False, - print_history=True, + log_traceback=True, + return_traceback=True, on_fail=None, ) + # TODO collect history for overview of which steps have been pulled from if udf_value: - logging.info(f"Found target UDF '{target_udf}' with value '{udf_value}'") - logging.info(f"Traceback:\n{udf_history}") target_artifact.udf[target_udf] = udf_value target_artifact.put() logging.info( diff --git a/scripts/log_udfs.py b/scripts/log_udfs.py index e42d6af0..a8a202ed 100644 --- a/scripts/log_udfs.py +++ b/scripts/log_udfs.py @@ -37,7 +37,7 @@ def main(lims, args): file_str = None # Parse outputs and their UDFs - if udf_tools.no_outputs(currentStep): + if udf_tools.get_art_tuples(currentStep) == []: arts = [art for art in currentStep.all_inputs() if art.type == "Analyte"] else: arts = [art for art in currentStep.all_outputs() if art.type == "Analyte"] diff --git a/scripts/ont_calc_volumes.py b/scripts/ont_calc_volumes.py index 9271745f..312dcbb1 100644 --- a/scripts/ont_calc_volumes.py +++ b/scripts/ont_calc_volumes.py @@ -45,7 +45,11 @@ def main(lims, args): # Get last known length size_bp, size_bp_history = udf_tools.fetch_last( - currentStep, art_tuple, "Size (bp)", on_fail=None, print_history=True + target_art=art_out, + target_udfs="Size (bp)", + log_traceback=True, + return_traceback=True, + on_fail=None, ) log.append(f"'Size (bp)': {size_bp}\n{size_bp_history}") diff --git a/scripts/ont_update_amount.py b/scripts/ont_update_amount.py index f3c6fbfc..37e430af 100644 --- a/scripts/ont_update_amount.py +++ b/scripts/ont_update_amount.py @@ -44,10 +44,10 @@ def main(lims, args): or "ONT Barcoding" in currentStep.type.name ): size_bp, size_bp_history = udf_tools.fetch_last( - currentStep=currentStep, - art_tuple=art_tuple, + target_art=art_out, target_udfs="Size (bp)", - print_history=True, + log_traceback=True, + return_traceback=True, on_fail=None, ) log.append(f"'Size (bp)': {size_bp}\n{size_bp_history}") From 391c2749564c32165c4d9e6f6cabe0b07104414d Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 7 Jan 2025 16:23:08 +0100 Subject: [PATCH 13/31] fix so function can return None explicitly --- scilifelab_epps/utils/udf_tools.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index eb5fc2e9..c00ffbed 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -124,7 +124,7 @@ def fetch_last( target_udfs: str | list, log_traceback=False, return_traceback=False, - on_fail=None, + on_fail=AssertionError, ) -> (str | int | float) | tuple[str | int | float, dict]: """Recursively look for target UDF. 
@@ -227,12 +227,12 @@ def fetch_last( ) except AssertionError: - if on_fail is not None: - logging.warning( - f"Failed traceback for artifact '{target_art.name}' ({target_art.id}), falling back to on_fail value '{on_fail}'" + if isinstance(on_fail, type) and issubclass(on_fail, Exception): + raise on_fail( + f"Could not find matching UDF(s) [{', '.join(target_udfs)}] for artifact {target_art}" ) - return on_fail else: - raise AssertionError( - f"Could not find matching UDF(s) [{', '.join(target_udfs)}] for artifact {target_art}" + logging.warning( + f"Failed traceback for artifact '{target_art.name}' ({target_art.id}), falling back to value '{on_fail}'" ) + return on_fail From cd0ee61194acde1c654e9715f97cfa7c3f5b25f8 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 7 Jan 2025 16:27:31 +0100 Subject: [PATCH 14/31] try displaying steps used on exit 0 --- scripts/fetch_last_known_field.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index b15056b1..b335a33e 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import logging +import sys from argparse import ArgumentParser from datetime import datetime as dt @@ -55,6 +56,7 @@ def main(args): if i["uri"].type == "Analyte" and o["uri"].type == "Analyte" } + steps_used = [] for art_in in arts_in: if no_outputs: target_artifact = art_in @@ -70,8 +72,10 @@ def main(args): return_traceback=True, on_fail=None, ) - # TODO collect history for overview of which steps have been pulled from - if udf_value: + + steps_used.append(udf_history[-1]["Artifact"]["Parent Step"]["Name"]) + + if udf_value is not None: target_artifact.udf[target_udf] = udf_value target_artifact.put() logging.info( @@ -83,6 +87,10 @@ def main(args): ) logging.info(f"Traceback:\n{udf_history}") + sys.stdout( + f"UDF '{target_udf}' pulled from steps: {' ,'.join(set(steps_used))}. Please double check the values." + ) + if __name__ == "__main__": # Parse args From 565431791272db832284f3899e58855e4bca671a Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 7 Jan 2025 16:30:20 +0100 Subject: [PATCH 15/31] bugfix --- scripts/fetch_last_known_field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index b335a33e..2369e538 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -38,7 +38,7 @@ def main(args): ), f"No target UDF supplied from step field '{args.step_udf}'" # Check whether process has output artifacts, not the case for e.g. QC steps - no_outputs: bool = udf_tools.no_outputs(process) + no_outputs: bool = True if udf_tools.get_art_tuples(process) == [] else False # Load input artifacts arts_in: list[Artifact] = [ From e97f89691ab0fd16a230f1aad4ee91873119f131 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 7 Jan 2025 16:34:25 +0100 Subject: [PATCH 16/31] try stderr --- scripts/fetch_last_known_field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index 2369e538..2f14d022 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -87,7 +87,7 @@ def main(args): ) logging.info(f"Traceback:\n{udf_history}") - sys.stdout( + sys.stderr( f"UDF '{target_udf}' pulled from steps: {' ,'.join(set(steps_used))}. Please double check the values." 
) From 87a3a0cdadba79811ca10119ef32458527c7a9e8 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 7 Jan 2025 16:36:32 +0100 Subject: [PATCH 17/31] try w/o --- scripts/fetch_last_known_field.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index 2f14d022..b8ec3e5f 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -87,9 +87,8 @@ def main(args): ) logging.info(f"Traceback:\n{udf_history}") - sys.stderr( - f"UDF '{target_udf}' pulled from steps: {' ,'.join(set(steps_used))}. Please double check the values." - ) + # TODO use variable + msg = f"UDF '{target_udf}' pulled from steps: {' ,'.join(set(steps_used))}. Please double check the values." if __name__ == "__main__": From 94750dda22485e1502af5409820a1bafa5a2aa50 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 7 Jan 2025 16:42:26 +0100 Subject: [PATCH 18/31] rename and change banner approach --- scripts/fetch_last_known_field.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index b8ec3e5f..ecb796e1 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -65,7 +65,7 @@ def main(args): logging.info( f"Looking for last recorded UDF '{target_udf}' of {'input' if no_outputs else 'output'} artifact '{target_artifact.name}'..." ) - udf_value, udf_history = udf_tools.fetch_last( + udf_value, traceback = udf_tools.fetch_last( target_art=target_artifact, target_udfs=target_udf, log_traceback=True, @@ -73,7 +73,10 @@ def main(args): on_fail=None, ) - steps_used.append(udf_history[-1]["Artifact"]["Parent Step"]["Name"]) + steps_used.append( + f"'{traceback[-1]['Artifact']['Parent Step']['Name']}'" + + f" ({traceback[-1]['Artifact']['Parent Step']['ID']})" + ) if udf_value is not None: target_artifact.udf[target_udf] = udf_value @@ -85,10 +88,11 @@ def main(args): logging.warning( f"Could not traceback UDF '{target_udf}' for {'input' if no_outputs else 'output'} artifact '{target_artifact.name}'" ) - logging.info(f"Traceback:\n{udf_history}") + logging.info(f"Traceback:\n{traceback}") - # TODO use variable - msg = f"UDF '{target_udf}' pulled from steps: {' ,'.join(set(steps_used))}. Please double check the values." + logging.warning( + f"UDF '{target_udf}' pulled from steps: {' ,'.join(set(steps_used))}. Please double check the values." + ) if __name__ == "__main__": From e79603468f8e9b641bb14892e1b33a89aba701ec Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 12:43:20 +0100 Subject: [PATCH 19/31] test exit 0 message --- scilifelab_epps/wrapper.py | 1 + scripts/fetch_last_known_field.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/scilifelab_epps/wrapper.py b/scilifelab_epps/wrapper.py index 5380fbee..dade9c03 100644 --- a/scilifelab_epps/wrapper.py +++ b/scilifelab_epps/wrapper.py @@ -97,6 +97,7 @@ def epp_wrapper(args): ) sys.exit(2) else: + print("Test print") sys.exit(0) return epp_wrapper diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index ecb796e1..a687c6f1 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -90,9 +90,10 @@ def main(args): ) logging.info(f"Traceback:\n{traceback}") - logging.warning( - f"UDF '{target_udf}' pulled from steps: {' ,'.join(set(steps_used))}. Please double check the values." 
- ) + # Look into exit 0 with message + # - print + # - stderr + # - sys.exit("blabla") if __name__ == "__main__": From f778f39829af6c59b076a6431b7e966cd0e0713e Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 12:45:25 +0100 Subject: [PATCH 20/31] prev --- scilifelab_epps/wrapper.py | 1 - scripts/fetch_last_known_field.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/scilifelab_epps/wrapper.py b/scilifelab_epps/wrapper.py index dade9c03..5380fbee 100644 --- a/scilifelab_epps/wrapper.py +++ b/scilifelab_epps/wrapper.py @@ -97,7 +97,6 @@ def epp_wrapper(args): ) sys.exit(2) else: - print("Test print") sys.exit(0) return epp_wrapper diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index a687c6f1..20715e51 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -94,6 +94,7 @@ def main(args): # - print # - stderr # - sys.exit("blabla") + print("Test print within script scope") if __name__ == "__main__": From 38a1ee5dbdcf215da2ba3567f1ff9865b27321ff Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 12:47:16 +0100 Subject: [PATCH 21/31] prev --- scripts/fetch_last_known_field.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index 20715e51..a213e52f 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -90,11 +90,10 @@ def main(args): ) logging.info(f"Traceback:\n{traceback}") - # Look into exit 0 with message - # - print - # - stderr - # - sys.exit("blabla") - print("Test print within script scope") + # Write to stdout for the green banner + print( + f"UDF '{target_udf}' pulled from steps: {' ,'.join(set(steps_used))}. Please double check the values." + ) if __name__ == "__main__": From 16423cb5d7157e43eeeed963055355e740cd08f2 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 14:23:08 +0100 Subject: [PATCH 22/31] Add a very useful utility function to check for unpopulated UDFs and throw appropriate warning --- scilifelab_epps/utils/udf_tools.py | 57 ++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index c00ffbed..2f2ce46d 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -1,7 +1,7 @@ import json import logging from typing import Union - +import xml.etree.ElementTree as ET from genologics.entities import Artifact, Process from requests.exceptions import HTTPError @@ -10,6 +10,32 @@ """ +def process_has_udfs(process: Process, target_udfs: list[str]) -> list[str]: + """Check whether any target UDFs are present in the sample fields of the process associated type. + + This function is necessary because a non-required sample UDF left blank will not be detected in the artifact object. + + Returns a list of found UDFs, or an empty list if none were found. 
+ """ + + # Get the raw xml of the process associated type + raw_xml = process.type.xml() + + # Parse as tree object + root = ET.fromstring(raw_xml) + + # Instantiate return object + target_udfs_found = [] + + # Check whether the target UDF is present in the sample fields + for sample_field in root.iter("sample-field"): + for target_udf in target_udfs: + if sample_field.attrib["name"] == target_udf: + target_udfs_found.append(target_udf) + + return target_udfs_found + + def put(target: Artifact | Process, target_udf: str, val, on_fail=AssertionError): """Try to put UDF on artifact or process, optionally without causing fatal error. Evaluates true on success and error (default) or on_fail param on failure. @@ -130,14 +156,15 @@ def fetch_last( Arguments: - target_art Artifact to traceback and assign UDF value to. + target_art Artifact to traceback. Any target UDFs already present in this artifact will be ignored. - target_udfs Can be supplied as a string, or as a prioritized + target_udfs The UDF(s) to look for. Can be supplied as a string, or as a prioritized list of strings. log_traceback If True, will log the full traceback. - return_traceback If True, will return the traceback too. + return_traceback If False, will return only UDF value. + If True, will also return the traceback as a dict. on_fail If not None, will return this value on failure. """ @@ -179,15 +206,19 @@ def fetch_last( if pp_art_tuples != []: for pp_tuple in pp_art_tuples: if pp_tuple[1]["uri"].id == current_art.id: + # Dynamically reassign current artifact current_art = pp_tuple[0]["uri"] break - # If not, TODO else: - raise NotImplementedError() + raise NotImplementedError("Parent process has no valid input-output links, traceback can't continue.") + # Dynamically reassign parent process pp = current_art.parent_process + + # Keep track of visited parent processes if pp is not None: steps_visited.append(f"'{pp.type.name}' ({pp.id})") + target_udfs_in_parent_process = process_has_udfs(pp, target_udfs) traceback.append( { @@ -221,6 +252,15 @@ def fetch_last( else: return current_art.udf[target_udf] + # Address the case that no target UDFs were found on the artifact, even though they were present in the parent process + if target_udfs_in_parent_process != []: + logging.warning( + f"Parent process '{pp.type.name}' ({pp.id})" + + f" has target UDF(s) {target_udfs_in_parent_process}," + + f" but it's not filled in for artifact '{current_art}' ({current_art.id})." + + f" Please double check that you haven't missed filling it in.") + + # Stop traceback if no parent process is found if pp is None: raise AssertionError( f"Artifact '{current_art.name}' ({current_art.id}) has no parent process linked and can't be traced back further." 
@@ -235,4 +275,7 @@ def fetch_last( logging.warning( f"Failed traceback for artifact '{target_art.name}' ({target_art.id}), falling back to value '{on_fail}'" ) - return on_fail + if return_traceback: + return on_fail, traceback + else: + return on_fail From 58a22f8f4c4f8c0a2e099b67ea5480bc498fe928 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 14:27:30 +0100 Subject: [PATCH 23/31] improve logs --- scilifelab_epps/utils/udf_tools.py | 2 +- scripts/fetch_last_known_field.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index 2f2ce46d..b74e666e 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -257,7 +257,7 @@ def fetch_last( logging.warning( f"Parent process '{pp.type.name}' ({pp.id})" + f" has target UDF(s) {target_udfs_in_parent_process}," - + f" but it's not filled in for artifact '{current_art}' ({current_art.id})." + + f" but it's not filled in for artifact '{current_art.name}' ({current_art.id})." + f" Please double check that you haven't missed filling it in.") # Stop traceback if no parent process is found diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index a213e52f..db6ff54d 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -47,7 +47,7 @@ def main(args): # Find target output artifacts, if any if no_outputs: - logging.info("Step has no output artifacts. Assigning to input artifact.") + logging.info("Step has no output artifacts. Assigning to input artifacts.") else: art_tuples: list[tuple[dict]] = process.input_output_maps art_in2out: dict[str:Artifact] = { From 29201dfeae1264fca03f245534d39b802880368b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 14:28:45 +0100 Subject: [PATCH 24/31] ruff --- scilifelab_epps/utils/udf_tools.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index b74e666e..92d600f9 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -210,7 +210,9 @@ def fetch_last( current_art = pp_tuple[0]["uri"] break else: - raise NotImplementedError("Parent process has no valid input-output links, traceback can't continue.") + raise NotImplementedError( + "Parent process has no valid input-output links, traceback can't continue." + ) # Dynamically reassign parent process pp = current_art.parent_process @@ -258,7 +260,8 @@ def fetch_last( f"Parent process '{pp.type.name}' ({pp.id})" + f" has target UDF(s) {target_udfs_in_parent_process}," + f" but it's not filled in for artifact '{current_art.name}' ({current_art.id})." - + f" Please double check that you haven't missed filling it in.") + + f" Please double check that you haven't missed filling it in." 
+ ) # Stop traceback if no parent process is found if pp is None: From 8d5b95e465dbc8993c383ec2ec3bba91c153e19a Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 14:29:18 +0100 Subject: [PATCH 25/31] ruff check fixes --- scilifelab_epps/utils/udf_tools.py | 5 +++-- scripts/fetch_last_known_field.py | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index 92d600f9..09292a09 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -1,7 +1,8 @@ import json import logging -from typing import Union import xml.etree.ElementTree as ET +from typing import Union + from genologics.entities import Artifact, Process from requests.exceptions import HTTPError @@ -260,7 +261,7 @@ def fetch_last( f"Parent process '{pp.type.name}' ({pp.id})" + f" has target UDF(s) {target_udfs_in_parent_process}," + f" but it's not filled in for artifact '{current_art.name}' ({current_art.id})." - + f" Please double check that you haven't missed filling it in." + + " Please double check that you haven't missed filling it in." ) # Stop traceback if no parent process is found diff --git a/scripts/fetch_last_known_field.py b/scripts/fetch_last_known_field.py index db6ff54d..40115735 100644 --- a/scripts/fetch_last_known_field.py +++ b/scripts/fetch_last_known_field.py @@ -1,6 +1,5 @@ #!/usr/bin/env python import logging -import sys from argparse import ArgumentParser from datetime import datetime as dt From a5ab60cd0915429024609971dcc209ce9b2b8056 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 14:35:11 +0100 Subject: [PATCH 26/31] bump docstring --- scilifelab_epps/utils/udf_tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index 09292a09..9a19c6a4 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -164,10 +164,10 @@ def fetch_last( log_traceback If True, will log the full traceback. - return_traceback If False, will return only UDF value. - If True, will also return the traceback as a dict. + return_traceback If True, will additionally return the traceback as a dict. - on_fail If not None, will return this value on failure. + on_fail If this is a subclass of Exception, will raise this exception on failure. + If not, will return this value on failure instead of the UDF value. 
""" # Convert to list, to enable iteration From 0b5913e9bbcb89a00c5555087dea025abcbd39d5 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 14:57:18 +0100 Subject: [PATCH 27/31] fix log --- scilifelab_epps/utils/udf_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index 9a19c6a4..898d7a74 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -273,7 +273,7 @@ def fetch_last( except AssertionError: if isinstance(on_fail, type) and issubclass(on_fail, Exception): raise on_fail( - f"Could not find matching UDF(s) [{', '.join(target_udfs)}] for artifact {target_art}" + f"Could not find matching UDF(s) [{', '.join(target_udfs)}] for artifact '{target_art.name}' ({target_art.id})" ) else: logging.warning( From b8e314e9ac8849ce8e6a4d16a1a101f8d8ba9a2e Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 15:36:58 +0100 Subject: [PATCH 28/31] fix issue with ont pooling, add argument to allow fetching from target artifact, refactor loop --- scilifelab_epps/utils/udf_tools.py | 104 +++++++++++++++-------------- scilifelab_epps/zika/utils.py | 4 +- 2 files changed, 58 insertions(+), 50 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index 898d7a74..b54c22ea 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -149,6 +149,7 @@ def list_udfs(art: Artifact) -> list: def fetch_last( target_art: Artifact, target_udfs: str | list, + include_current=False, log_traceback=False, return_traceback=False, on_fail=AssertionError, @@ -157,11 +158,13 @@ def fetch_last( Arguments: - target_art Artifact to traceback. Any target UDFs already present in this artifact will be ignored. + target_art Artifact to traceback. target_udfs The UDF(s) to look for. Can be supplied as a string, or as a prioritized list of strings. + include_current If True, will pull target UDFs if found in the target artifact. + log_traceback If True, will log the full traceback. return_traceback If True, will additionally return the traceback as a dict. @@ -178,43 +181,12 @@ def fetch_last( traceback = [] steps_visited = [] + # Instantiate recursion variables + current_art = target_art + n = 1 try: - # First iteration, current artifact is the target artifact. Don't pull any UDF values. - current_art = target_art - pp = current_art.parent_process - assert pp, f"Artifact '{current_art.name}' ({current_art.id}) has no parent process linked." - steps_visited.append(f"'{pp.type.name}' ({pp.id})") - - traceback.append( - { - "Artifact": { - "Name": current_art.name, - "ID": current_art.id, - "UDFs": dict(current_art.udf.items()), - "Parent Step": { - "Name": pp.type.name if pp else None, - "ID": pp.id if pp else None, - }, - } - } - ) - # Start recursive search while True: - pp_art_tuples = get_art_tuples(pp) - - # If parent process has valid input-output tuples, use for linkage - if pp_art_tuples != []: - for pp_tuple in pp_art_tuples: - if pp_tuple[1]["uri"].id == current_art.id: - # Dynamically reassign current artifact - current_art = pp_tuple[0]["uri"] - break - else: - raise NotImplementedError( - "Parent process has no valid input-output links, traceback can't continue." 
- ) - # Dynamically reassign parent process pp = current_art.parent_process @@ -240,23 +212,30 @@ def fetch_last( # Search for correct UDF for target_udf in target_udfs: if target_udf in list_udfs(current_art): - if log_traceback is True: - logging.info(f"Traceback:\n{json.dumps(traceback, indent=2)}") - logging.info( - f"Found target UDF '{target_udf}'" - + f" with value '{current_art.udf[target_udf]}'" - + f" in process {steps_visited[-1]}" - + f" {'output' if pp else 'input'}" - + f" artifact '{current_art.name}' ({current_art.id})" - ) - - if return_traceback: - return current_art.udf[target_udf], traceback + if include_current is not True and n == 1: + logging.info( + "Target UDF was found in specified artifact, but include_current is set to False. Skipping." + ) else: - return current_art.udf[target_udf] + if log_traceback is True: + logging.info( + f"Traceback:\n{json.dumps(traceback, indent=2)}" + ) + logging.info( + f"Found target UDF '{target_udf}'" + + f" with value '{current_art.udf[target_udf]}'" + + f" in process {steps_visited[-1]}" + + f" {'output' if pp else 'input'}" + + f" artifact '{current_art.name}' ({current_art.id})" + ) + + if return_traceback: + return current_art.udf[target_udf], traceback + else: + return current_art.udf[target_udf] # Address the case that no target UDFs were found on the artifact, even though they were present in the parent process - if target_udfs_in_parent_process != []: + if pp is not None and target_udfs_in_parent_process != []: logging.warning( f"Parent process '{pp.type.name}' ({pp.id})" + f" has target UDF(s) {target_udfs_in_parent_process}," @@ -270,6 +249,33 @@ def fetch_last( f"Artifact '{current_art.name}' ({current_art.id}) has no parent process linked and can't be traced back further." ) + pp_art_tuples = get_art_tuples(pp) + + # If parent process has valid input-output tuples, use for linkage + linked_input_arts = [] + if pp_art_tuples != []: + for pp_tuple in pp_art_tuples: + if pp_tuple[1]["uri"].id == current_art.id: + linked_input_arts.append(pp_tuple[0]["uri"]) + else: + raise NotImplementedError( + "Parent process has no valid input-output links, traceback can't continue." + ) + + if len(linked_input_arts) == 1: + # Dynamically reassign current artifact + current_art = linked_input_arts[0] + elif len(linked_input_arts) > 1: + raise AssertionError( + "Parent process has multiple input artifacts linked to the same output artifact, can't traceback." + ) + else: + raise AssertionError( + "Parent process has no input artifacts linked to the output artifact, can't traceback." 
+ ) + + n += 1 + except AssertionError: if isinstance(on_fail, type) and issubclass(on_fail, Exception): raise on_fail( diff --git a/scilifelab_epps/zika/utils.py b/scilifelab_epps/zika/utils.py index 89055188..41cbfeee 100644 --- a/scilifelab_epps/zika/utils.py +++ b/scilifelab_epps/zika/utils.py @@ -115,7 +115,9 @@ def fetch_sample_data(currentStep: Process, to_fetch: dict) -> pd.DataFrame: row[col_name] = None else: row[col_name] = fetch_last( - target_art=art_tuple[1]["uri"], target_udfs=udf_query + target_art=art_tuple[0]["uri"], + target_udfs=udf_query, + include_current=True, ) rows.append(row) From b5ad5846127b092d4fae85352ad71f8fb2c91549 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 15:52:34 +0100 Subject: [PATCH 29/31] remove superfluous handling of traceback --- .../calc_from_args/udf_arg_methods.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/scilifelab_epps/calc_from_args/udf_arg_methods.py b/scilifelab_epps/calc_from_args/udf_arg_methods.py index b7a5defd..d2bbb9f4 100644 --- a/scilifelab_epps/calc_from_args/udf_arg_methods.py +++ b/scilifelab_epps/calc_from_args/udf_arg_methods.py @@ -23,7 +23,6 @@ def fetch_from_arg( """ - history: str | None = None source: Artifact | Process source_name: str @@ -49,11 +48,9 @@ def fetch_from_arg( if arg_dict["recursive"]: # Fetch UDF recursively - value, history = udf_tools.fetch_last( + value = udf_tools.fetch_last( target_art=source, target_udfs=arg_dict["udf"], - log_traceback=True, - return_traceback=True, ) else: # Fetch UDF from input or output artifact @@ -72,17 +69,6 @@ def fetch_from_arg( else: return on_fail - # Log what has been done - log_str = f"Fetched UDF '{arg_dict['udf']}': {value} from {arg_dict['source']} '{source_name}'." - - if history: - history_yaml = yaml.load(history, Loader=yaml.FullLoader) - last_step_name = history_yaml[-1]["Step name"] - last_step_id = history_yaml[-1]["Step ID"] - log_str += f"\n\tUDF recusively fetched from step: '{last_step_name}' (ID: '{last_step_id}')" - - logging.info(log_str) - return value From b4165f9f0fd85c0494fe6777f1e0058ff6d4c712 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 15:54:46 +0100 Subject: [PATCH 30/31] ruff fixes --- scilifelab_epps/calc_from_args/udf_arg_methods.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scilifelab_epps/calc_from_args/udf_arg_methods.py b/scilifelab_epps/calc_from_args/udf_arg_methods.py index d2bbb9f4..77ef5236 100644 --- a/scilifelab_epps/calc_from_args/udf_arg_methods.py +++ b/scilifelab_epps/calc_from_args/udf_arg_methods.py @@ -1,8 +1,6 @@ #!/usr/bin/env python -import logging from typing import Any -import yaml from genologics.entities import Artifact, Process from scilifelab_epps.utils import udf_tools From 5426656eb55e086644bd031cc169b397b3561b3b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 8 Jan 2025 15:56:18 +0100 Subject: [PATCH 31/31] shut up mypy --- scilifelab_epps/utils/udf_tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scilifelab_epps/utils/udf_tools.py b/scilifelab_epps/utils/udf_tools.py index b54c22ea..fec53764 100644 --- a/scilifelab_epps/utils/udf_tools.py +++ b/scilifelab_epps/utils/udf_tools.py @@ -1,7 +1,7 @@ import json import logging import xml.etree.ElementTree as ET -from typing import Union +from typing import Any, Union from genologics.entities import Artifact, Process from requests.exceptions import HTTPError @@ -153,7 +153,7 @@ def fetch_last( log_traceback=False, return_traceback=False, 
     on_fail=AssertionError,
-) -> (str | int | float) | tuple[str | int | float, dict]:
+) -> Any | tuple[Any, dict]:
     """Recursively look for target UDF.
 
     Arguments: