From 5ea79c9b4dca37a64435e4b327b6bfa54c432e51 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 5 Jul 2023 16:21:15 +0300 Subject: [PATCH 01/29] Refactor unneeded variable naming --- src/services/extract_matching_cases.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/services/extract_matching_cases.py b/src/services/extract_matching_cases.py index 11324df..1763ecf 100644 --- a/src/services/extract_matching_cases.py +++ b/src/services/extract_matching_cases.py @@ -15,9 +15,9 @@ def __init__(self, offset_results: dict, offset_range: tuple, reference_db): def extract_candidates_matching_selected_offset(self) -> dict: extracted_candidates = {} - for key, value in self.offset_results.items(): + for transcript_id, value in self.offset_results.items(): strand, offsets = value["strand"], value["offsets"] - reference_id, transcript_id = value["reference_id"], key + reference_id = value["reference_id"] if strand == "-": offsets.reverse() for offset_exon_idx in range(1, len(offsets)-1): From 49e935aabf57a489cccc8702110b3974a5d033a6 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 5 Jul 2023 16:25:23 +0300 Subject: [PATCH 02/29] Skip exons with no number --- src/services/extract_matching_cases.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/services/extract_matching_cases.py b/src/services/extract_matching_cases.py index 1763ecf..67b8d28 100644 --- a/src/services/extract_matching_cases.py +++ b/src/services/extract_matching_cases.py @@ -28,6 +28,8 @@ def extract_candidates_matching_selected_offset(self) -> dict: str(offset_exon_number) + ".start" + \ '.offset_' + str(offsets[offset_exon_idx][0]) for exon in self.reference_db.children(reference_id, featuretype='exon'): + if 'exon_number' not in exon.attributes: + continue if int(exon['exon_number'][0]) == offset_exon_number: extracted_candidates[entry_key] = { "transcript_id": transcript_id, @@ -43,6 +45,8 @@ def extract_candidates_matching_selected_offset(self) -> dict: str(offset_exon_number) + ".end" + \ '.offset_' + str(offsets[offset_exon_idx][1]) for exon in self.reference_db.children(reference_id, featuretype='exon'): + if 'exon_number' not in exon.attributes: + continue if int(exon['exon_number'][0]) == offset_exon_number: extracted_candidates[entry_key] = { "transcript_id": transcript_id, From c8dee5778072b21fc6ee12daadbd1446ab40fdaa Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 5 Jul 2023 16:37:06 +0300 Subject: [PATCH 03/29] skip cases with no values when making graphs --- src/services/graph_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/services/graph_manager.py b/src/services/graph_manager.py index e9b00e0..519cb58 100644 --- a/src/services/graph_manager.py +++ b/src/services/graph_manager.py @@ -21,6 +21,8 @@ def construct_bar_chart_from_dict(self, title: str, x_label: str, y_label: str): + if not len(graph_values): + return normalized_graph_values = self.normalize_values(graph_values) values = sorted(normalized_graph_values.items()) x_values, y_values = zip(*values) From 1961d83b0aba871eb63b15e969d27b03b66bf68e Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Wed, 5 Jul 2023 17:06:13 +0300 Subject: [PATCH 04/29] add log-folder --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index af94810..99738b4 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ offsets.txt htmlcov/ logs/ dx1-logs/ +dx3-logs/ From 114d2ffcc645a591f0d552ed31b67eb1b5db4efc Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 
09:00:14 +0300 Subject: [PATCH 05/29] Remove the use of attributes with gffutils --- src/services/extract_matching_cases.py | 39 +++++++++++--------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/services/extract_matching_cases.py b/src/services/extract_matching_cases.py index 67b8d28..ee5d8b4 100644 --- a/src/services/extract_matching_cases.py +++ b/src/services/extract_matching_cases.py @@ -2,7 +2,8 @@ class MatchingCasesExtractor: def __init__(self, offset_results: dict, offset_range: tuple, reference_db): """ - Extract candidates matching the selected offset. + Extract candidates matching the selected offset. Currently first and last exon are not considered. + These are omitted with the magic numbers in the second for loop. Args: offset_results (dict): a dictionary containing the offset results for each transcript @@ -16,42 +17,36 @@ def __init__(self, offset_results: dict, offset_range: tuple, reference_db): def extract_candidates_matching_selected_offset(self) -> dict: extracted_candidates = {} for transcript_id, value in self.offset_results.items(): - strand, offsets = value["strand"], value["offsets"] - reference_id = value["reference_id"] - if strand == "-": - offsets.reverse() + strand, offsets, reference_id = value["strand"], value["offsets"], value["reference_id"] for offset_exon_idx in range(1, len(offsets)-1): - offset_exon_number = offset_exon_idx + 1 - if abs(offsets[offset_exon_idx][0]) >= self.offset_range[0] and \ - abs(offsets[offset_exon_idx][0]) <= self.offset_range[1]: + start_is_in_range = bool(abs(offsets[offset_exon_idx][0]) >= self.offset_range[0] and + abs(offsets[offset_exon_idx][0]) <= self.offset_range[1]) + if start_is_in_range: entry_key = transcript_id + ".exon_" + \ - str(offset_exon_number) + ".start" + \ + str(offset_exon_idx) + ".start" + \ '.offset_' + str(offsets[offset_exon_idx][0]) - for exon in self.reference_db.children(reference_id, featuretype='exon'): - if 'exon_number' not in exon.attributes: - continue - if int(exon['exon_number'][0]) == offset_exon_number: + for exon_number, exon in enumerate(self.reference_db.children(reference_id, featuretype='exon', order_by='start')): + if exon_number == offset_exon_idx: extracted_candidates[entry_key] = { "transcript_id": transcript_id, "strand": strand, - "exon_number": offset_exon_number, + "exon_number": offset_exon_idx, "location_type": "start", "location": exon.start + offsets[offset_exon_idx][0], "offset": offsets[offset_exon_idx][0] } - if abs(offsets[offset_exon_idx][1]) >= self.offset_range[0] and \ - abs(offsets[offset_exon_idx][1]) <= self.offset_range[1]: + end_is_in_range = bool(abs(offsets[offset_exon_idx][1]) >= self.offset_range[0] and + abs(offsets[offset_exon_idx][1]) <= self.offset_range[1]) + if end_is_in_range: entry_key = transcript_id + ".exon_" + \ - str(offset_exon_number) + ".end" + \ + str(offset_exon_idx) + ".end" + \ '.offset_' + str(offsets[offset_exon_idx][1]) - for exon in self.reference_db.children(reference_id, featuretype='exon'): - if 'exon_number' not in exon.attributes: - continue - if int(exon['exon_number'][0]) == offset_exon_number: + for exon_number, exon in enumerate(self.reference_db.children(reference_id, featuretype='exon', order_by='start')): + if exon_number == offset_exon_idx: extracted_candidates[entry_key] = { "transcript_id": transcript_id, "strand": strand, - "exon_number": offset_exon_number, + "exon_number": offset_exon_idx, "location_type": "end", "location": exon.end + offsets[offset_exon_idx][1], "offset": 
offsets[offset_exon_idx][1] From 41f30b2a871eb63b15e969d27b03b66bf68e434 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 10:00:59 +0300 Subject: [PATCH 06/29] Add min threshold for img cases --- src/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/config.py b/src/config.py index bfb317a..72a252b 100644 --- a/src/config.py +++ b/src/config.py @@ -30,6 +30,7 @@ CIGAR_RESULTS_LOG = os.path.join(LOG_FILE_DIR, cigar_results) TITLE_FILE_LENGTH = os.getenv("TITLE_FILE_LENGTH") or 50 DEFAULT_WINDOW_SIZE = os.getenv("DEFAULT_WINDOW_SIZE") or 8 +CREATE_IMG_N_TRESHOLD = os.getenv("CREATE_IMG_N_TRESHOLD") or 100 if not os.path.exists(LOG_DIR): os.mkdir(LOG_DIR) From 58dfef7e7339f42883075b593a05e292ea3d01da Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 10:01:12 +0300 Subject: [PATCH 07/29] Add default values, add img-threshold --- src/services/argument_parser.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/services/argument_parser.py b/src/services/argument_parser.py index 76ee974..ad2e5c7 100644 --- a/src/services/argument_parser.py +++ b/src/services/argument_parser.py @@ -2,12 +2,13 @@ import json from config import DEFAULT_WINDOW_SIZE +from config import CREATE_IMG_N_TRESHOLD def init_argparser(): parser = argparse.ArgumentParser( description='compAna: a tool for comparing annotations', - usage='python3 compAna.py -i [-f] [-s]' + usage='python3 src/compana.py -g -r -a [-o ] [-f] [-s] [-c ] [-j ] [-b ] [-t ] [-w ] [-e]' ) parser.add_argument( '-g', '--gffcompare_gtf', @@ -56,11 +57,16 @@ def init_argparser(): parser.add_argument( '-w', '--window_size', help='window from which indels and mismatches are to be searched in interesting locations', metavar='', - metavar='') + default=DEFAULT_WINDOW_SIZE) parser.add_argument( '-e', '--extended_debug', help='enable extended debug output', action='store_true') + parser.add_argument( + '-i', '--img_n_threshold', + help='threshold for the n of cases for creating images', + default=CREATE_IMG_N_TRESHOLD) parser_args = parser.parse_args() parser_dict = vars(parser_args) @@ -90,7 +96,4 @@ max(0, offset_range[0]), max(0, offset_range[1]) ) - if not parser_args.window_size: - parser_dict["window_size"] = DEFAULT_WINDOW_SIZE - return parser_args From cde3f4f245866c4db4354472cc4dbc7032021339 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 10:01:43 +0300 Subject: [PATCH 08/29] create skeleton for closest canonicals validation for imgs --- src/services/log_manager.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index 6d8539a..a704e84 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -59,6 +59,11 @@ def compute_indel_results(self): }) return indel_results + def validate_img_should_be_created_for_closest_canonical_dict_entry(self, key: tuple, nucleotide_pair: tuple, cases: dict): + if sum(cases.values()) < 100: + return False + pass + def generate_closest_canonicals_graphs(self, dict_of_canonicals: dict): output_manager.output_line({ "line": "Creating graphs for closest canonicals", "is_info": True }) From 451ff899a8db775f64457e0cbab3faf2051d5a50 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 10:27:26 +0300 Subject: [PATCH 09/29] Update template for args --- arguments_template.json | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arguments_template.json b/arguments_template.json index 1ed2c86..5dc65c7 100644 --- 
a/arguments_template.json +++ b/arguments_template.json @@ -4,9 +4,10 @@ "reference_gtf": "", "reference_fasta": "", "gffcompare_gtf": "", - "offset": 0, - "class_code": "j c k", - "force": false, + "offset": [0, 6], + "class_code": "j c k s x", "window_size": 8, + "min_reads_for_graph": 100, + "force": false, "extended_debugging": false } \ No newline at end of file From 8a8c8e493f5d5d4700b33e09bed575c93e31e84c Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 10:27:43 +0300 Subject: [PATCH 10/29] rename argument to be more descriptive --- src/services/argument_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/services/argument_parser.py b/src/services/argument_parser.py index ad2e5c7..5abed91 100644 --- a/src/services/argument_parser.py +++ b/src/services/argument_parser.py @@ -64,7 +64,7 @@ def init_argparser(): help='enable extended debug output', action='store_true') parser.add_argument( - '-i', '--img_n_threshold', + '-m', '--min_reads_for_graph', help='threshold for the n of cases for creating images', default=CREATE_IMG_N_TRESHOLD) From 5c08346447dfda3dd0bd0a4a2d9d34c7abd1d0f2 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 10:28:00 +0300 Subject: [PATCH 11/29] Add validation for closest canonicals imgs --- src/services/log_manager.py | 40 +++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index a704e84..92a1790 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -59,23 +59,31 @@ def compute_indel_results(self): }) return indel_results - def validate_img_should_be_created_for_closest_canonical_dict_entry(self, key: tuple, nucleotide_pair: tuple, cases: dict): - if sum(cases.values()) < 100: + def validate_img_should_be_created_for_closest_canonical_dict_entry(self, parser_args, key: tuple, nucleotide_pair: tuple, cases: dict): + if sum(cases.values()) < int(parser_args.min_reads_for_graph): return False - pass + acceptor_site_canonicals = ["AG", "AC"] + donor_site_canonicals = ["GT", "GC", "AT"] + if key[1] == 'start' and nucleotide_pair[1] not in acceptor_site_canonicals: + return False + if key[1] == 'end' and nucleotide_pair[1] not in donor_site_canonicals: + return False + return True - def generate_closest_canonicals_graphs(self, dict_of_canonicals: dict): + def generate_closest_canonicals_graphs(self, parser_args, dict_of_canonicals: dict): output_manager.output_line({ "line": "Creating graphs for closest canonicals", "is_info": True }) - minimum_case_count = 100 for key, nucleotides in dict_of_canonicals.items(): for nucleotide_pair, cases in nucleotides.items(): - case_count = sum(cases.values()) - if case_count < minimum_case_count: + if not self.validate_img_should_be_created_for_closest_canonical_dict_entry( + parser_args, key, nucleotide_pair, cases + ): continue + case_count = sum(cases.values()) + title = f"{nucleotide_pair}: strand: {key[0]}, exon location: {key[1]}, offset: {key[2]}, direction of pair: {key[3]}, n of cases: {case_count}" filename = "closest-canonicals." 
+ ".strand_" + str(key[0]) + ".exon-loc-" + \ str(key[1]) + ".offset-(" + str(key[2]) + ")" + \ @@ -148,9 +156,7 @@ def compute_closest_canonicals_dict(self): return closest_canonicals_dict - def generate_json_overview_dict_for_closest_canonicals(self): - closest_canonicals_dict = self.compute_closest_canonicals_dict() - self.generate_closest_canonicals_graphs(closest_canonicals_dict) + def generate_json_overview_dict_for_closest_canonicals(self, closest_canonicals_dict: dict): json_overview = {} for key, canonical_values in closest_canonicals_dict.items(): @@ -160,8 +166,7 @@ def generate_json_overview_dict_for_closest_canonicals(self): canonical_values[item]) return json_overview - def write_closest_canonicals_log_to_file(self, parser_args): - json_overview = self.generate_json_overview_dict_for_closest_canonicals() + def write_closest_canonicals_log_to_file(self, parser_args, json_overview: dict): with open(FASTA_OVERVIEW_FILE, "w", encoding="utf-8") as file: file.write("# Overview\n") @@ -258,6 +263,15 @@ def write_debug_files(self): "is_info": True }) + def generate_output_for_closest_canonicals(self, parser_args): + closest_canonicals_dict = self.compute_closest_canonicals_dict() + closest_canonicals_json_dict = self.generate_json_overview_dict_for_closest_canonicals( + closest_canonicals_dict) + self.generate_closest_canonicals_graphs( + parser_args, closest_canonicals_dict) + self.write_closest_canonicals_log_to_file( + parser_args, closest_canonicals_json_dict) + def execute_log_file_creation(self, matching_cases_dict: dict, parser_args): output_manager.output_line({ @@ -267,7 +281,7 @@ def execute_log_file_creation(self, matching_cases_dict: dict, parser_args): self.matching_cases_dict = matching_cases_dict - self.write_closest_canonicals_log_to_file(parser_args) + self.generate_output_for_closest_canonicals(parser_args) self.generate_indel_graphs() if parser_args.extended_debug: From 6afa7ce4bb087451e993f12379055cd31650debf Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 10:29:02 +0300 Subject: [PATCH 12/29] Accept lower case canonicals in validation --- src/services/log_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index 92a1790..32179b2 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -64,9 +64,9 @@ def validate_img_should_be_created_for_closest_canonical_dict_entry(self, parser return False acceptor_site_canonicals = ["AG", "AC"] donor_site_canonicals = ["GT", "GC", "AT"] - if key[1] == 'start' and nucleotide_pair[1] not in acceptor_site_canonicals: + if key[1] == 'start' and str(nucleotide_pair[1]).upper() not in acceptor_site_canonicals: return False - if key[1] == 'end' and nucleotide_pair[1] not in donor_site_canonicals: + if key[1] == 'end' and str(nucleotide_pair[1]).upper() not in donor_site_canonicals: return False return True From 3d0b7e698006ce10b5ca229bd4a14051db7560c4 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 11:10:13 +0300 Subject: [PATCH 13/29] Rename offset-log file --- src/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.py b/src/config.py index 72a252b..3c524d7 100644 --- a/src/config.py +++ b/src/config.py @@ -10,7 +10,7 @@ print("Warning! 
.env file not found.") temporary_dir_path = os.getenv("TEMPORARY_DIR") or "temporary_files" -offset_log = os.getenv("OFFSET_LOG") or "offset_log.txt" +offset_log = os.getenv("OFFSET_LOG_FILE") or "debug_offsets.log" test_file_directory = os.getenv( "TEST_FILE_DIRECTORY") or "src/tests/files" From 6df55a8e45fe50d51368479362ca147efb102381 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 11:10:28 +0300 Subject: [PATCH 14/29] Improve stdout. Move indel-results to file --- src/services/log_manager.py | 50 ++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index 32179b2..9649527 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -75,6 +75,7 @@ def generate_closest_canonicals_graphs(self, parser_args, dict_of_canonicals: di "line": "Creating graphs for closest canonicals", "is_info": True }) + graphs_created = 0 for key, nucleotides in dict_of_canonicals.items(): for nucleotide_pair, cases in nucleotides.items(): @@ -98,19 +99,50 @@ def generate_closest_canonicals_graphs(self, parser_args, dict_of_canonicals: di x_label=f"Number of errors (n of cases: {case_count})", y_label="Portion of reads", ) + graphs_created += 1 output_manager.output_line({ - "line": f"Graphs for closest canonicals created.", + "line": f"Done. A total of {graphs_created} graphs created for closest canonicals.", "is_info": True }) - def generate_indel_graphs(self): + def write_indel_results_to_file(self, indel_results): indel_results = self.compute_indel_results() + filepath = os.path.join(LOG_FILE_DIR, 'indel_results.log') + results = [] + total_reads_in_indel_results = 0 + for key, value in indel_results.items(): + results.append( + f"in/del: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}: {value}\n") + output_manager.output_line({ + "line": f"in/del: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}: {value}", + "is_info": True + }) + total_reads_in_indel_results += sum(value.values()) + + with open(filepath, "w", encoding="utf-8") as file: + file.writelines(results) + file.write( + f"Total number of reads in indel results: {total_reads_in_indel_results}\n") + output_manager.output_line({ + "line": f"Done. Indel-results written to: {filepath}", + "is_info": True + }) + + def validate_indel_grahp_should_be_created(self, parser_args, cases: dict): + if sum(cases.values()) < int(parser_args.min_reads_for_graph): + return False + return True + + def generate_indel_graphs(self, parser_args, indel_results): output_manager.output_line({ - "line": "Insertions and deletions found at given locations", + "line": "Generating graphs for insertions and deletions found at given locations", "is_info": True }) total_reads_in_indel_results = 0 + graphs_created = 0 for key, value in indel_results.items(): + if not self.validate_indel_grahp_should_be_created(parser_args, value): + continue title = f"Type: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}" filename = "indel." 
+ str(key[0]) + ".strand_" + str(key[1]) + ".exon-loc-" + \ str(key[2]) + ".offset-(" + str(key[3]) + ")" @@ -126,12 +158,13 @@ def generate_indel_graphs(self): x_label=f"Number of errors (n of cases: {sum(value.values())})", y_label="Portion of reads", ) + graphs_created += 1 output_manager.output_line({ "line": f"Total number of reads in indel results: {total_reads_in_indel_results}", "is_info": True }) output_manager.output_line({ - "line": f"Graphs for {len(indel_results)} cases created.", + "line": f"Graphs for {graphs_created} cases created.", "is_info": True }) @@ -272,17 +305,22 @@ def generate_output_for_closest_canonicals(self, parser_args): self.write_closest_canonicals_log_to_file( parser_args, closest_canonicals_json_dict) + def generate_output_for_indels(self, parser_args): + indel_results = self.compute_indel_results() + self.generate_indel_graphs(parser_args, indel_results) + self.write_indel_results_to_file(indel_results) + def execute_log_file_creation(self, matching_cases_dict: dict, parser_args): output_manager.output_line({ "line": "Creating log-files", - "is_info": True + "is_title": True }) self.matching_cases_dict = matching_cases_dict self.generate_output_for_closest_canonicals(parser_args) - self.generate_indel_graphs() + self.generate_output_for_indels(parser_args) if parser_args.extended_debug: self.debug_logs["matching_cases_dict"] = self.matching_cases_dict From 7c9ebffa53aa21eb5dfc56b82ffcc14d8e5c594f Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 12:11:38 +0300 Subject: [PATCH 15/29] remove unneeded output --- src/services/log_manager.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index 9649527..b768ee0 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -113,10 +113,6 @@ def write_indel_results_to_file(self, indel_results): for key, value in indel_results.items(): results.append( f"in/del: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}: {value}\n") - output_manager.output_line({ - "line": f"in/del: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}: {value}", - "is_info": True - }) total_reads_in_indel_results += sum(value.values()) with open(filepath, "w", encoding="utf-8") as file: From f08f2e2ba4f791ad01548ec542b75574f79cb989 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 12:19:23 +0300 Subject: [PATCH 16/29] remove unneeded output --- src/services/log_manager.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index b768ee0..a7b62c4 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -142,10 +142,6 @@ def generate_indel_graphs(self, parser_args, indel_results): title = f"Type: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}" filename = "indel." 
+ str(key[0]) + ".strand_" + str(key[1]) + ".exon-loc-" + \ str(key[2]) + ".offset-(" + str(key[3]) + ")" - output_manager.output_line({ - "line": f"in/del: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}: {value}", - "is_info": True - }) total_reads_in_indel_results += sum(value.values()) graph_manager.construct_bar_chart_from_dict( graph_values=value, From 0e95e47d6d3b7d82fbc026aef50d0b7aae5c707a Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 12:28:45 +0300 Subject: [PATCH 17/29] Remove unneeded debug stdout --- src/services/argument_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/services/argument_parser.py b/src/services/argument_parser.py index 5abed91..f898ef6 100644 --- a/src/services/argument_parser.py +++ b/src/services/argument_parser.py @@ -86,7 +86,6 @@ def init_argparser(): parser_dict["offset"] = (0, float('inf')) else: offset_range = [] - print(parser_args.offset) for element in parser_args.offset: if isinstance(element, str) and len(element) == 3: offset_range.append(float('inf')) From ff4f327bf9fee123242e20a86c714f0cfcd51963 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 12:28:58 +0300 Subject: [PATCH 18/29] Only print errors if there are errors --- src/services/log_manager.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index a7b62c4..8d553c5 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -35,10 +35,10 @@ def write_alignment_errors_to_file(self): def compute_indel_results(self): indel_results = {} - count_no_indel_errors = 0 + count_indel_errors = 0 for matching_case in self.matching_cases_dict.values(): if 'indel_errors' not in matching_case: - count_no_indel_errors += 1 + count_indel_errors += 1 self.alignment_erros.append(str(matching_case) + "\n") continue for indel_type, indel_dict in matching_case['indel_errors'].items(): @@ -53,10 +53,12 @@ def compute_indel_results(self): if indel_dict_key not in indel_results[key]: indel_results[key][indel_dict_key] = 0 indel_results[key][indel_dict_key] += indel_event_count - output_manager.output_line({ - "line": f"Number of cases without indel errors: {count_no_indel_errors}", - "is_error": True - }) + + if count_indel_errors: + output_manager.output_line({ + "line": f"Number of cases without indel errors: {count_indel_errors}", + "is_error": True + }) return indel_results def validate_img_should_be_created_for_closest_canonical_dict_entry(self, parser_args, key: tuple, nucleotide_pair: tuple, cases: dict): @@ -106,7 +108,6 @@ def generate_closest_canonicals_graphs(self, parser_args, dict_of_canonicals: di }) def write_indel_results_to_file(self, indel_results): - indel_results = self.compute_indel_results() filepath = os.path.join(LOG_FILE_DIR, 'indel_results.log') results = [] total_reads_in_indel_results = 0 From dd98c024d518b1c337a95613d02fa0145b9adefa Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 12:32:11 +0300 Subject: [PATCH 19/29] Improve stdout clarity --- src/services/log_manager.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index 8d553c5..cdf70f2 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -56,7 +56,7 @@ def compute_indel_results(self): if count_indel_errors: output_manager.output_line({ - "line": f"Number of cases without 
indel errors: {count_indel_errors}", + "line": f"Indel results: number of cases without indel errors: {count_indel_errors}", "is_error": True }) return indel_results @@ -74,7 +74,7 @@ def validate_img_should_be_created_for_closest_canonical_dict_entry(self, parser def generate_closest_canonicals_graphs(self, parser_args, dict_of_canonicals: dict): output_manager.output_line({ - "line": "Creating graphs for closest canonicals", + "line": "Closest canonicals: creating graphs.", "is_info": True }) graphs_created = 0 @@ -103,7 +103,7 @@ def generate_closest_canonicals_graphs(self, parser_args, dict_of_canonicals: di ) graphs_created += 1 output_manager.output_line({ - "line": f"Done. A total of {graphs_created} graphs created for closest canonicals.", + "line": f"Closest canonicals: done. A total of {graphs_created} graphs created for closest canonicals.", "is_info": True }) @@ -119,9 +119,9 @@ def write_indel_results_to_file(self, indel_results): with open(filepath, "w", encoding="utf-8") as file: file.writelines(results) file.write( - f"Total number of reads in indel results: {total_reads_in_indel_results}\n") + f"Indel results: Total number of reads in indel results: {total_reads_in_indel_results}\n") output_manager.output_line({ - "line": f"Done. Indel-results written to: {filepath}", + "line": f"Indel results: done. Results written to: {filepath}", "is_info": True }) @@ -132,7 +132,7 @@ def validate_indel_grahp_should_be_created(self, parser_args, cases: dict): def generate_indel_graphs(self, parser_args, indel_results): output_manager.output_line({ - "line": "Generating graphs for insertions and deletions found at given locations", + "line": "Indel results: generating graphs for insertions and deletions found at given locations", "is_info": True }) total_reads_in_indel_results = 0 @@ -153,11 +153,11 @@ def generate_indel_graphs(self, parser_args, indel_results): ) graphs_created += 1 output_manager.output_line({ - "line": f"Total number of reads in indel results: {total_reads_in_indel_results}", + "line": f"Indel results: total number of reads in indel results: {total_reads_in_indel_results}", "is_info": True }) output_manager.output_line({ - "line": f"Graphs for {graphs_created} cases created.", + "line": f"Indel results: graphs for {graphs_created} cases created.", "is_info": True }) @@ -255,7 +255,7 @@ def write_closest_canonicals_log_to_file(self, parser_args, json_overview: dict) file.close() output_manager.output_line({ - "line": "Results written to file: " + FASTA_OVERVIEW_FILE, + "line": "Closest canonicals: results written to file: " + FASTA_OVERVIEW_FILE, "is_info": True }) @@ -306,7 +306,7 @@ def generate_output_for_indels(self, parser_args): def execute_log_file_creation(self, matching_cases_dict: dict, parser_args): output_manager.output_line({ - "line": "Creating log-files", + "line": "CREATING LOG-FILES", "is_title": True }) From 9bbcf796ba6a45538fd03bfde81f4c251521e612 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 12:41:26 +0300 Subject: [PATCH 20/29] Add warning to stdout --- src/services/output_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/services/output_manager.py b/src/services/output_manager.py index 4dbce95..e66cb0d 100644 --- a/src/services/output_manager.py +++ b/src/services/output_manager.py @@ -21,6 +21,7 @@ def output_line(self, config: dict): additional_line_breaks = config.get('additional_line_breaks', 0) is_info = config.get('is_info', False) is_error = config.get('is_error', False) + is_warning = 
config.get('is_warning', False) title_line_length = config.get('title_line_length', 50) save_to_log = config.get('save_to_log', True) if is_title: @@ -29,6 +30,8 @@ def output_line(self, config: dict): line = f"INFO: [{datetime.now().strftime('%Y-%m-%d %H:%M')}]: {line}" if is_error: line = f"ERROR: [{datetime.now().strftime('%Y-%m-%d %H:%M')}]: {line}" + if is_warning: + line = f"WARNING: [{datetime.now().strftime('%Y-%m-%d %H:%M')}]: {line}" print(line, end=end_line) if additional_line_breaks: print("\n" * additional_line_breaks, end="") From 276b3d7a3ab4c565bf140ce3615765f4482cbcf6 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 12:41:45 +0300 Subject: [PATCH 21/29] Change error to warning in stdout --- src/services/bam_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/services/bam_manager.py b/src/services/bam_manager.py index 8271987..be55310 100644 --- a/src/services/bam_manager.py +++ b/src/services/bam_manager.py @@ -99,11 +99,11 @@ def process_bam_file(self, reads_and_references: dict): output_manager.output_line({ "line": str(prev_processed_reads_counter) + " iterations extracted an already processed read from the BAM-file", - "is_error": True, + "is_warning": True, }) output_manager.output_line({ - "line": "\nFinished", + "line": "Processing BAM-file finished.", "is_info": True }) From ac527d3e2b006f8e427a5ed3553f8d2d8b8014a4 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 12:57:30 +0300 Subject: [PATCH 22/29] Improve stdout clarity --- src/services/log_manager.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index cdf70f2..bc4ea35 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -116,12 +116,13 @@ def write_indel_results_to_file(self, indel_results): f"in/del: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}: {value}\n") total_reads_in_indel_results += sum(value.values()) + summary_line = f"Indel results: Total number of reads in indel results -count: {total_reads_in_indel_results} (Note: one read can be related to multiple matching cases, or be related to multiple transcripts) \n" + with open(filepath, "w", encoding="utf-8") as file: file.writelines(results) - file.write( - f"Indel results: Total number of reads in indel results: {total_reads_in_indel_results}\n") + file.write(summary_line) output_manager.output_line({ - "line": f"Indel results: done. 
Results written to: {filepath}", + "line": summary_line, "is_info": True }) From 0158669b206b41502691ec764ac720a24a770d7f Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 13:08:35 +0300 Subject: [PATCH 23/29] Refactor stdout to match more with pipeline in documentation --- src/services/bam_manager.py | 6 +++--- src/services/extract_matching_cases.py | 15 +++++++++++++++ src/services/fasta_extractor.py | 8 ++++---- src/services/log_manager.py | 2 +- src/services/offset_computation.py | 11 +++++++---- src/tests/fasta_extractor_test.py | 2 +- 6 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/services/bam_manager.py b/src/services/bam_manager.py index be55310..6078119 100644 --- a/src/services/bam_manager.py +++ b/src/services/bam_manager.py @@ -109,7 +109,7 @@ def process_bam_file(self, reads_and_references: dict): def output_heading_information(self): output_manager.output_line({ - "line": "PROCESSING BAM-FILE", + "line": "ITERATE READS", "is_title": True }) output_manager.output_line({ @@ -157,12 +157,12 @@ def generate_dictionaries(self): log_manager.debug_logs['reads_and_references'] = reads_and_references output_manager.output_line({ - "line": "NUMBER OF MATCHING CASES:" + str(len(self.matching_cases_dict)), + "line": "Number of matching cases:" + str(len(self.matching_cases_dict)), "is_info": True }) output_manager.output_line({ - "line": "NUMBER OF READS: " + str(len(reads_and_references)), + "line": "Number of reads: " + str(len(reads_and_references)), "is_info": True }) diff --git a/src/services/extract_matching_cases.py b/src/services/extract_matching_cases.py index ee5d8b4..2606bff 100644 --- a/src/services/extract_matching_cases.py +++ b/src/services/extract_matching_cases.py @@ -1,3 +1,6 @@ +from services.output_manager import default_output_manager as output_manager + + class MatchingCasesExtractor: def __init__(self, offset_results: dict, offset_range: tuple, reference_db): @@ -15,6 +18,14 @@ def __init__(self, offset_results: dict, offset_range: tuple, reference_db): self.reference_db = reference_db def extract_candidates_matching_selected_offset(self) -> dict: + output_manager.output_line({ + "line": "EXTRACTING CASES", + "is_title": True + }) + output_manager.output_line({ + "line": f"Extracting candidates in the given offset range {self.offset_range}", + "is_info": True + }) extracted_candidates = {} for transcript_id, value in self.offset_results.items(): strand, offsets, reference_id = value["strand"], value["offsets"], value["reference_id"] @@ -51,4 +62,8 @@ def extract_candidates_matching_selected_offset(self) -> dict: "location": exon.end + offsets[offset_exon_idx][1], "offset": offsets[offset_exon_idx][1] } + output_manager.output_line({ + "line": f"Found {len(extracted_candidates)} candidates in the given offset range", + "is_info": True + }) return extracted_candidates diff --git a/src/services/fasta_extractor.py b/src/services/fasta_extractor.py index 69f8a56..4748c49 100644 --- a/src/services/fasta_extractor.py +++ b/src/services/fasta_extractor.py @@ -32,11 +32,11 @@ def extract_characters_at_given_coordinates(self, coordinates: tuple): def output_section_header(self): output_manager.output_line({ - "line": "FASTA EXTRACTION", + "line": "CLOSEST CANONICALS", "is_title": True }) output_manager.output_line({ - "line": "Fethching reference fasta file...", + "line": "Fethching reference FASTA-file.", "is_info": True }) @@ -44,7 +44,7 @@ def check_errors(self) -> bool: errors = False if not self.fasta: output_manager.output_line({ - 
"line": "fasta-file not found. Please check path and try again. Moving to next section.", + "line": "FASTA-file not found. Please check path and try again. Moving to next section.", "is_error": True }) errors = True @@ -122,6 +122,6 @@ def execute_fasta_extraction(self): self.iterate_matching_cases() output_manager.output_line({ - "line": "Phase finished.", + "line": "Finished.", "is_info": True }) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index bc4ea35..32571b8 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -307,7 +307,7 @@ def generate_output_for_indels(self, parser_args): def execute_log_file_creation(self, matching_cases_dict: dict, parser_args): output_manager.output_line({ - "line": "CREATING LOG-FILES", + "line": "CREATING LOG-FILES AND GRAPHS", "is_title": True }) diff --git a/src/services/offset_computation.py b/src/services/offset_computation.py index 4534f6a..22e6316 100644 --- a/src/services/offset_computation.py +++ b/src/services/offset_computation.py @@ -78,11 +78,11 @@ def fetch_exons(transcript, class_code, gffcompare_db, reference_db): def execute_offset_computation(class_code: str, gffcompare_db, reference_db, extended_debug: bool) -> dict: output_manager.output_line({ - "line": "ANNOTATION COMPARISON", + "line": "COMPUTE OFFSETS", "is_title": True }) output_manager.output_line({ - "line": "Counting offsets for class code instances for: " + class_code, + "line": "Computing offsets for class code instances for: " + class_code, "is_info": True }) @@ -107,10 +107,13 @@ def execute_offset_computation(class_code: str, gffcompare_db, reference_db, ext if extended_debug: log_manager.offset_results = offset_results_as_dict - + output_manager.output_line({ + "line": "Finished.", + "is_info": True + }) else: output_manager.output_line({ - "line": "offset computation finished. Hint: to output results into a file, enable extended debug.", + "line": "Offset computation finished. Hint: to output results into a file, enable extended debug.", "is_info": True }) return offset_results_as_dict diff --git a/src/tests/fasta_extractor_test.py b/src/tests/fasta_extractor_test.py index 26cefc5..8c8bde7 100644 --- a/src/tests/fasta_extractor_test.py +++ b/src/tests/fasta_extractor_test.py @@ -64,7 +64,7 @@ def test_an_empty_string_title_generates_a_line_of_selected_character(self): extractor = FastaExtractor(self.fasta_config) extractor.output_section_header() captured = self.capsys.readouterr() - assert 'FASTA EXTRACTION' in captured.out + assert 'CLOSEST CANONICALS' in captured.out def test_successful_execution_of_extraction_updates_dictionary(self): extractor = FastaExtractor(self.fasta_config) From 8106efb51b25cde45ad8f9df478a732d246ad02b Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 13:15:00 +0300 Subject: [PATCH 24/29] Improve stdout --- src/services/bam_manager.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/services/bam_manager.py b/src/services/bam_manager.py index 6078119..52b2ee2 100644 --- a/src/services/bam_manager.py +++ b/src/services/bam_manager.py @@ -27,6 +27,10 @@ def initialize_file(self, filename: str): self.samfile = pysam.AlignmentFile(filename, "rb") def process_bam_file(self, reads_and_references: dict): + output_manager.output_line({ + "line": "Iterating reads and counting indels. 
This may take a while.", + "is_info": True + }) count = 0 errors = [] set_of_processed_reads = set() @@ -156,18 +160,16 @@ def generate_dictionaries(self): log_manager.debug_logs['transcripts_and_reads'] = transcripts_and_reads log_manager.debug_logs['reads_and_references'] = reads_and_references - output_manager.output_line({ - "line": "Number of matching cases:" + str(len(self.matching_cases_dict)), - "is_info": True - }) + cases_line = f"Number of matching cases: {len(self.matching_cases_dict)}, " + \ + f"number of reads: {len(reads_and_references)}\n" output_manager.output_line({ - "line": "Number of reads: " + str(len(reads_and_references)), + "line": cases_line, "is_info": True }) output_manager.output_line({ - "line": "Analyzing offset of reads. This may take a while.", + "line": "Note: one read may be related to multiple matching cases.", "is_info": True }) From f77a8fb08ec2d9977b027605f5d3245a1bcc1038 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 13:20:44 +0300 Subject: [PATCH 25/29] Refactor based on pylint feedback --- src/services/log_manager.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index 32571b8..fc2308d 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -61,7 +61,11 @@ def compute_indel_results(self): }) return indel_results - def validate_img_should_be_created_for_closest_canonical_dict_entry(self, parser_args, key: tuple, nucleotide_pair: tuple, cases: dict): + def validate_img_should_be_created_for_closest_canonical_dict_entry(self, + parser_args, + key: tuple, + nucleotide_pair: tuple, + cases: dict): if sum(cases.values()) < int(parser_args.min_reads_for_graph): return False acceptor_site_canonicals = ["AG", "AC"] @@ -87,7 +91,8 @@ def generate_closest_canonicals_graphs(self, parser_args, dict_of_canonicals: di continue case_count = sum(cases.values()) - title = f"{nucleotide_pair}: strand: {key[0]}, exon location: {key[1]}, offset: {key[2]}, direction of pair: {key[3]}, n of cases: {case_count}" + title = f"{nucleotide_pair}: strand: {key[0]}, exon location: {key[1]}, offset: {key[2]}, " + \ + f"direction of pair: {key[3]}, n of cases: {case_count}" filename = "closest-canonicals." 
+ ".strand_" + str(key[0]) + ".exon-loc-" + \ str(key[1]) + ".offset-(" + str(key[2]) + ")" + \ ".direction-" + \ @@ -113,10 +118,13 @@ def write_indel_results_to_file(self, indel_results): total_reads_in_indel_results = 0 for key, value in indel_results.items(): results.append( - f"in/del: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}: {value}\n") + f"in/del: {key[0]}, strand: {key[1]}, exon location: {key[2]}, " + + f"offset: {key[3]}, n of cases: {sum(value.values())}: {value}\n") total_reads_in_indel_results += sum(value.values()) - summary_line = f"Indel results: Total number of reads in indel results -count: {total_reads_in_indel_results} (Note: one read can be related to multiple matching cases, or be related to multiple transcripts) \n" + summary_line = f"Indel results: count of reads in indel results: {total_reads_in_indel_results}" + \ + "(Note: one read can be related to multiple matching cases, " + \ + "or be related to multiple transcripts) \n" with open(filepath, "w", encoding="utf-8") as file: file.writelines(results) @@ -141,7 +149,8 @@ def generate_indel_graphs(self, parser_args, indel_results): for key, value in indel_results.items(): if not self.validate_indel_grahp_should_be_created(parser_args, value): continue - title = f"Type: {key[0]}, strand: {key[1]}, exon location: {key[2]}, offset: {key[3]}, n of cases: {sum(value.values())}" + title = f"Type: {key[0]}, strand: {key[1]}, exon location: {key[2]}, " + \ + f"offset: {key[3]}, n of cases: {sum(value.values())}" filename = "indel." + str(key[0]) + ".strand_" + str(key[1]) + ".exon-loc-" + \ str(key[2]) + ".offset-(" + str(key[3]) + ")" total_reads_in_indel_results += sum(value.values()) From bc9d34a41ab8cd6e606fc5a8defcb930240fc3e1 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 13:55:07 +0300 Subject: [PATCH 26/29] refactor code based on pylint feedback --- src/compana.py | 2 +- src/services/alignment_parser.py | 27 +++++++++------------- src/services/bam_manager.py | 20 +++++++---------- src/services/db_initializer.py | 6 +++-- src/services/extract_matching_cases.py | 16 ++++++++----- src/services/fasta_extractor.py | 14 +++++++----- src/services/graph_manager.py | 3 +-- src/services/log_manager.py | 31 ++++++++++++++++---------- src/services/offset_computation.py | 9 ++++++-- src/services/output_manager.py | 3 ++- src/tests/alignment_parser_test.py | 9 ++++---- src/tests/bam_manager_test.py | 6 ++--- 12 files changed, 77 insertions(+), 69 deletions(-) diff --git a/src/compana.py b/src/compana.py index 72c9eb8..4491c2c 100644 --- a/src/compana.py +++ b/src/compana.py @@ -49,7 +49,7 @@ def run_pipeline(parser_args): parser_args.reads_bam, parser_args.reads_tsv, matching_cases_dict) - bam_manager.execute(parser_args.window_size) + bam_manager.execute(int(parser_args.window_size)) log_manager.execute_log_file_creation(matching_cases_dict, parser_args) diff --git a/src/services/alignment_parser.py b/src/services/alignment_parser.py index 369c213..6f7abca 100644 --- a/src/services/alignment_parser.py +++ b/src/services/alignment_parser.py @@ -1,10 +1,3 @@ -import pysam - -from services.output_manager import default_output_manager as output_manager -from services.log_manager import default_log_manager as log_manager -from config import DEFAULT_WINDOW_SIZE - - class AlignmentParser: def __init__(self): @@ -13,9 +6,6 @@ def __init__(self): Follows the singleton pattern. 
""" - # TODO: add to configuration or to user arguments (window size) - self.window_size = int(DEFAULT_WINDOW_SIZE) - def extract_location_from_cigar_string(self, cigar_tuples: list, reference_start: int, @@ -25,11 +15,12 @@ def extract_location_from_cigar_string(self, alignment_position = 0 ref_position = 0 - for idx, cigar_code in enumerate(cigar_tuples): + for cigar_code in cigar_tuples: if cigar_code[0] in [0, 2, 3, 7, 8]: ref_position += cigar_code[1] - if ref_position <= relative_position and not reference_start + ref_position == reference_end: + if ref_position <= relative_position and not \ + reference_start + ref_position == reference_end: alignment_position += cigar_code[1] else: return alignment_position + (cigar_code[1] - (ref_position - relative_position)) @@ -39,7 +30,8 @@ def extract_location_from_cigar_string(self, def count_indels_from_cigar_codes_in_given_window(self, cigar_tuples: list, aligned_location: int, - loc_type: str): + loc_type: str, + window_size: int): """ Get cigar codes in a given window. @@ -62,16 +54,17 @@ def count_indels_from_cigar_codes_in_given_window(self, debug_list = [] if loc_type == "end": - aligned_location = aligned_location - self.window_size + 1 + aligned_location = aligned_location - window_size + 1 for cigar_code in cigar_tuples: - if self.window_size == len(cigar_code_list): + if window_size == len(cigar_code_list): break if location + cigar_code[1] > aligned_location: overlap = location + \ cigar_code[1] - (aligned_location + len(cigar_code_list)) cigar_code_list.extend( - [cigar_code[0] for _ in range(min(self.window_size - len(cigar_code_list), overlap))]) + [cigar_code[0] for _ in range(min(window_size - + len(cigar_code_list), overlap))]) location += cigar_code[1] debug_list.append(cigar_code_list) @@ -84,7 +77,7 @@ def count_indels_from_cigar_codes_in_given_window(self, result['deletions'] = deletions result['insertions'] = insertions - if deletions >= self.window_size or insertions >= self.window_size: + if deletions >= window_size or insertions >= window_size: debug_list.append( f"deletions: {deletions}, insertions: {insertions}") return True, debug_list, result diff --git a/src/services/bam_manager.py b/src/services/bam_manager.py index 52b2ee2..80ab5e6 100644 --- a/src/services/bam_manager.py +++ b/src/services/bam_manager.py @@ -1,4 +1,3 @@ -import os import pysam from services.read_management import create_dict_of_transcripts_and_reads @@ -6,8 +5,6 @@ from services.alignment_parser import default_alignment_parser as alignment_parser from services.log_manager import default_log_manager as log_manager -from config import LOG_FILE_DIR - class BamManager: @@ -15,18 +12,16 @@ def __init__(self, bam_path: str, tsv_path: str, matching_cases_dict: dict): + # pylint: disable=no-member self.bam_path = bam_path self.tsv_path = tsv_path + self.samfile = pysam.AlignmentFile(self.bam_path, "rb") self.matching_cases_dict = matching_cases_dict self.reads_and_transcripts = {} - def initialize_file(self, filename: str): - # pylint: disable=no-member - self.samfile = pysam.AlignmentFile(filename, "rb") - - def process_bam_file(self, reads_and_references: dict): + def process_bam_file(self, reads_and_references: dict, window_size: int): output_manager.output_line({ "line": "Iterating reads and counting indels. 
This may take a while.", "is_info": True @@ -65,7 +60,8 @@ def process_bam_file(self, reads_and_references: dict): idx_corrected_location = location - 1 - if read.reference_start > idx_corrected_location or read.reference_end < idx_corrected_location: + if read.reference_start > idx_corrected_location \ + or read.reference_end < idx_corrected_location: errors.append( f"Non-matching location: {read.query_name}, {matching_case_key}\t") continue @@ -87,7 +83,8 @@ def process_bam_file(self, reads_and_references: dict): error, debug_list, result = alignment_parser.count_indels_from_cigar_codes_in_given_window( read.cigartuples, aligned_location, - loc_type) + loc_type, + window_size) for key, value in result.items(): if value not in self.matching_cases_dict[matching_case_key]['indel_errors'][key]: self.matching_cases_dict[matching_case_key]['indel_errors'][key][value] = 0 @@ -181,5 +178,4 @@ def execute(self, window_size: int): reads_and_references = self.generate_dictionaries() - self.initialize_file(self.bam_path) - self.process_bam_file(reads_and_references) + self.process_bam_file(reads_and_references, window_size) diff --git a/src/services/db_initializer.py b/src/services/db_initializer.py index 4b903e6..c90107e 100644 --- a/src/services/db_initializer.py +++ b/src/services/db_initializer.py @@ -32,13 +32,15 @@ def init_databases(gffcompare_gtf: str, reference_gtf: str, force=False) -> tupl if not force and db_exists: output_manager.output_line({ - "line": f"{key}: database file already exists. Using existing database. Use -f to force overwriting existing db-files.", + "line": f"{key}: database file already exists. Using existing database. " + + "Use -f to force overwriting existing db-files.", "is_info": True }) else: output_manager.output_line({ - "line": f"{key}: database file does not exist. Creating database. This might take a while.", + "line": f"{key}: database file does not exist. " + + "Creating database. This might take a while.", "is_info": True }) gffutils.create_db( diff --git a/src/services/extract_matching_cases.py b/src/services/extract_matching_cases.py index 2606bff..c5426b2 100644 --- a/src/services/extract_matching_cases.py +++ b/src/services/extract_matching_cases.py @@ -5,8 +5,9 @@ class MatchingCasesExtractor: def __init__(self, offset_results: dict, offset_range: tuple, reference_db): """ - Extract candidates matching the selected offset. Currently first and last exon are not considered. - These are omitted with the magic numbers in the second for loop. + Extract candidates matching the selected offset. Currently first and last + exon are not considered. These are omitted with the magic numbers in the + second for loop. 
Args: offset_results (dict): a dictionary containing the offset results for each transcript @@ -30,13 +31,15 @@ def extract_candidates_matching_selected_offset(self) -> dict: for transcript_id, value in self.offset_results.items(): strand, offsets, reference_id = value["strand"], value["offsets"], value["reference_id"] for offset_exon_idx in range(1, len(offsets)-1): - start_is_in_range = bool(abs(offsets[offset_exon_idx][0]) >= self.offset_range[0] and - abs(offsets[offset_exon_idx][0]) <= self.offset_range[1]) + start_is_in_range = bool( + abs(offsets[offset_exon_idx][0]) >= self.offset_range[0] and + abs(offsets[offset_exon_idx][0]) <= self.offset_range[1]) if start_is_in_range: entry_key = transcript_id + ".exon_" + \ str(offset_exon_idx) + ".start" + \ '.offset_' + str(offsets[offset_exon_idx][0]) - for exon_number, exon in enumerate(self.reference_db.children(reference_id, featuretype='exon', order_by='start')): + for exon_number, exon in enumerate(self.reference_db.children( + reference_id, featuretype='exon', order_by='start')): if exon_number == offset_exon_idx: extracted_candidates[entry_key] = { "transcript_id": transcript_id, @@ -52,7 +55,8 @@ def extract_candidates_matching_selected_offset(self) -> dict: entry_key = transcript_id + ".exon_" + \ str(offset_exon_idx) + ".end" + \ '.offset_' + str(offsets[offset_exon_idx][1]) - for exon_number, exon in enumerate(self.reference_db.children(reference_id, featuretype='exon', order_by='start')): + for exon_number, exon in enumerate(self.reference_db.children( + reference_id, featuretype='exon', order_by='start')): if exon_number == offset_exon_idx: extracted_candidates[entry_key] = { "transcript_id": transcript_id, diff --git a/src/services/fasta_extractor.py b/src/services/fasta_extractor.py index 4748c49..52fd131 100644 --- a/src/services/fasta_extractor.py +++ b/src/services/fasta_extractor.py @@ -1,7 +1,6 @@ -import json from pyfaidx import Fasta from services.output_manager import default_output_manager as output_manager -from config import FASTA_OVERVIEW_FILE, DEFAULT_WINDOW_SIZE +from config import DEFAULT_WINDOW_SIZE class FastaExtractor: @@ -44,7 +43,8 @@ def check_errors(self) -> bool: errors = False if not self.fasta: output_manager.output_line({ - "line": "FASTA-file not found. Please check path and try again. Moving to next section.", + "line": "FASTA-file not found. Please check path and try again. 
" + + "Moving to next section.", "is_error": True }) errors = True @@ -98,8 +98,9 @@ def iterate_matching_cases(self): splice_cite_location = value["location"] - 2 else: splice_cite_location = value["location"] + 1 - coordinates = (key.split('.')[ - 1], splice_cite_location - self.window_size, splice_cite_location + self.window_size) + coordinates = (key.split('.')[1], + splice_cite_location - self.window_size, + splice_cite_location + self.window_size) nucleotides = self.extract_characters_at_given_coordinates( coordinates) if value["location_type"] == "start": @@ -115,7 +116,8 @@ def execute_fasta_extraction(self): return output_manager.output_line({ - "line": f"Searching closest possible canonical sites for provided offset range: {self.offset}", + "line": "Searching closest possible canonical " + + f"sites for provided offset range: {self.offset}", "is_info": True }) diff --git a/src/services/graph_manager.py b/src/services/graph_manager.py index 519cb58..3c3dfe5 100644 --- a/src/services/graph_manager.py +++ b/src/services/graph_manager.py @@ -1,6 +1,5 @@ import os import matplotlib.pyplot as plt -import numpy as np from config import LOG_FILE_DIR @@ -21,7 +20,7 @@ def construct_bar_chart_from_dict(self, title: str, x_label: str, y_label: str): - if not len(graph_values): + if not graph_values: return normalized_graph_values = self.normalize_values(graph_values) values = sorted(normalized_graph_values.items()) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index fc2308d..39a000e 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -22,7 +22,8 @@ def write_offset_results_to_file(self): "transcript_id\treference_id\tclass_code\tstrand\toffsets\n") for key, value in self.offset_results.items(): file.write( - f"{key}\t{value['reference_id']}\t{value['class_code']}\t{value['strand']}\t{value['offsets']}\n") + f"{key}\t{value['reference_id']}\t{value['class_code']}\t" + + f"{value['strand']}\t{value['offsets']}\n") def write_alignment_errors_to_file(self): @@ -43,12 +44,11 @@ def compute_indel_results(self): continue for indel_type, indel_dict in matching_case['indel_errors'].items(): key = ( - indel_type, matching_case['strand'], matching_case['location_type'], matching_case['offset']) + indel_type, matching_case['strand'], + matching_case['location_type'], + matching_case['offset']) if key not in indel_results: indel_results[key] = {} - # TODO: A more pythonic way but harder to read? - # indel_results[key] = {k: indel_results[key].get(k, 0) + indel_dict.get(k, 0) - # for k in set(list(indel_results[key].keys()) + list(indel_dict.keys()))} for indel_dict_key, indel_event_count in indel_dict.items(): if indel_dict_key not in indel_results[key]: indel_results[key][indel_dict_key] = 0 @@ -56,7 +56,8 @@ def compute_indel_results(self): if count_indel_errors: output_manager.output_line({ - "line": f"Indel results: number of cases without indel errors: {count_indel_errors}", + "line": "Indel results: number of cases without indel errors: " + + f"{count_indel_errors}", "is_error": True }) return indel_results @@ -91,7 +92,8 @@ def generate_closest_canonicals_graphs(self, parser_args, dict_of_canonicals: di continue case_count = sum(cases.values()) - title = f"{nucleotide_pair}: strand: {key[0]}, exon location: {key[1]}, offset: {key[2]}, " + \ + title = f"{nucleotide_pair}: strand: {key[0]}, exon location: {key[1]}, " + \ + f"offset: {key[2]}, " + \ f"direction of pair: {key[3]}, n of cases: {case_count}" filename = "closest-canonicals." 
+ ".strand_" + str(key[0]) + ".exon-loc-" + \ str(key[1]) + ".offset-(" + str(key[2]) + ")" + \ @@ -108,7 +110,8 @@ def generate_closest_canonicals_graphs(self, parser_args, dict_of_canonicals: di ) graphs_created += 1 output_manager.output_line({ - "line": f"Closest canonicals: done. A total of {graphs_created} graphs created for closest canonicals.", + "line": f"Closest canonicals: done. A total of {graphs_created} " + + "graphs created for closest canonicals.", "is_info": True }) @@ -122,7 +125,8 @@ def write_indel_results_to_file(self, indel_results): f"offset: {key[3]}, n of cases: {sum(value.values())}: {value}\n") total_reads_in_indel_results += sum(value.values()) - summary_line = f"Indel results: count of reads in indel results: {total_reads_in_indel_results}" + \ + summary_line = "Indel results: count of reads in indel results: " + \ + f"{total_reads_in_indel_results}" + \ "(Note: one read can be related to multiple matching cases, " + \ "or be related to multiple transcripts) \n" @@ -141,7 +145,8 @@ def validate_indel_grahp_should_be_created(self, parser_args, cases: dict): def generate_indel_graphs(self, parser_args, indel_results): output_manager.output_line({ - "line": "Indel results: generating graphs for insertions and deletions found at given locations", + "line": "Indel results: generating graphs for insertions and deletions " + + "found at given locations", "is_info": True }) total_reads_in_indel_results = 0 @@ -163,7 +168,8 @@ def generate_indel_graphs(self, parser_args, indel_results): ) graphs_created += 1 output_manager.output_line({ - "line": f"Indel results: total number of reads in indel results: {total_reads_in_indel_results}", + "line": "Indel results: total number of reads in indel results: " + + f"{total_reads_in_indel_results}", "is_info": True }) output_manager.output_line({ @@ -245,7 +251,8 @@ def write_closest_canonicals_log_to_file(self, parser_args, json_overview: dict) file.write( "This section contains the results in table-format. ") file.write( - "To disable this section, set extended debug 'false' in provided arguments. \n") + "To disable this section, set extended debug 'false' " + + "in provided arguments. \n") current_chromosome = "chr1" file.write(f'\n### {current_chromosome}\n') file.write( diff --git a/src/services/offset_computation.py b/src/services/offset_computation.py index 22e6316..7004aaa 100644 --- a/src/services/offset_computation.py +++ b/src/services/offset_computation.py @@ -76,7 +76,11 @@ def fetch_exons(transcript, class_code, gffcompare_db, reference_db): return aligned_exons, reference_exons -def execute_offset_computation(class_code: str, gffcompare_db, reference_db, extended_debug: bool) -> dict: +def execute_offset_computation( + class_code: str, + gffcompare_db, + reference_db, + extended_debug: bool) -> dict: output_manager.output_line({ "line": "COMPUTE OFFSETS", "is_title": True @@ -113,7 +117,8 @@ def execute_offset_computation(class_code: str, gffcompare_db, reference_db, ext }) else: output_manager.output_line({ - "line": "Offset computation finished. Hint: to output results into a file, enable extended debug.", + "line": "Offset computation finished. 
" + + "Hint: to output results into a file, enable extended debug.", "is_info": True }) return offset_results_as_dict diff --git a/src/services/output_manager.py b/src/services/output_manager.py index e66cb0d..445c2bf 100644 --- a/src/services/output_manager.py +++ b/src/services/output_manager.py @@ -9,7 +9,8 @@ def __init__(self): self.log_output = [] def output_line(self, config: dict): - """Outputs given line to console. Can be used to output titles, info, errors and more. A simple custom logger. + """Outputs given line to console. Can be used to output titles, + info, warnings, errors and more. A simple custom logger. Args: config (dict): Configuration options for the output line. diff --git a/src/tests/alignment_parser_test.py b/src/tests/alignment_parser_test.py index 0120478..274e467 100644 --- a/src/tests/alignment_parser_test.py +++ b/src/tests/alignment_parser_test.py @@ -127,6 +127,7 @@ def test_case_that_has_it_s_location_at_final_match_returns_correct_value(self): class TestIndelCountingFromCigarCodes(TestCase): def setUp(self): self.parser = AlignmentParser() + self.window_size = 8 def test_indel_counter_returns_false_and_an_empty_debug_list_for_given_empty_list(self): cigar_tuples = [] @@ -136,13 +137,12 @@ def test_indel_counter_returns_false_and_an_empty_debug_list_for_given_empty_lis expected_result, expected_debug_list, expected_result = False, [ []], {'deletions': 0, 'insertions': 0} result, debug_list, result = alignment_parser.count_indels_from_cigar_codes_in_given_window( - cigar_tuples, aligned_location, loc_type) + cigar_tuples, aligned_location, loc_type, self.window_size) self.assertEqual((result, debug_list, result), (expected_result, expected_debug_list, expected_result)) def test_indels_are_counted_correctly(self): cigar_tuples = [(0, 20), (2, 3), (1, 2), (0, 10)] - self.parser.window_size = 8 aligned_location = 27 loc_type = "end" strand = "+" @@ -150,13 +150,12 @@ def test_indels_are_counted_correctly(self): 2, 2, 2, 1, 1, 0, 0, 0], {'deletions': 3, 'insertions': 2} errors, debug_list, result = alignment_parser.count_indels_from_cigar_codes_in_given_window( - cigar_tuples, aligned_location, loc_type) + cigar_tuples, aligned_location, loc_type, self.window_size) self.assertEqual((errors, debug_list[0], result), (expected_errors, expected_debug_list, expected_result)) def test_full_window_of_dels_returns_true_for_errors(self): cigar_tuples = [(0, 20), (2, 8), (1, 2), (0, 10)] - self.parser.window_size = 8 aligned_location = 20 loc_type = "start" strand = "+" @@ -164,6 +163,6 @@ def test_full_window_of_dels_returns_true_for_errors(self): 2, 2, 2, 2, 2, 2, 2, 2], {'deletions': 8, 'insertions': 0} errors, debug_list, result = alignment_parser.count_indels_from_cigar_codes_in_given_window( - cigar_tuples, aligned_location, loc_type) + cigar_tuples, aligned_location, loc_type, self.window_size) self.assertEqual((errors, debug_list[0], result), (expected_errors, expected_debug_list, expected_result)) diff --git a/src/tests/bam_manager_test.py b/src/tests/bam_manager_test.py index dfb4152..7572f55 100644 --- a/src/tests/bam_manager_test.py +++ b/src/tests/bam_manager_test.py @@ -71,10 +71,9 @@ def test_bam_manager_execute_runs_without_errors(self): class TestBamReader(TestCase): def setUp(self): - self.bam_manager = BamManager("", "", {}) test_bam_file = os.path.join( TEST_FILE_DIR, "Mouse.ONT.R9.4.sim.RE.no_gtf.transcript925.ch1.nnic.bam") - self.bam_manager.initialize_file(test_bam_file) + self.bam_manager = BamManager(test_bam_file, "", {}) def 
test_bam_reader_runs_without_errors(self): reads_and_references = { @@ -89,4 +88,5 @@ def test_bam_reader_runs_without_errors(self): } } - self.bam_manager.process_bam_file(reads_and_references) + window_size = 10 + self.bam_manager.process_bam_file(reads_and_references, window_size) From d9d8b04244d3408dcf1d223f39c5611bb9a093d1 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 14:01:03 +0300 Subject: [PATCH 27/29] Fix issue with extra line-break in stdout --- src/services/bam_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/services/bam_manager.py b/src/services/bam_manager.py index 80ab5e6..b7a7659 100644 --- a/src/services/bam_manager.py +++ b/src/services/bam_manager.py @@ -158,7 +158,7 @@ def generate_dictionaries(self): log_manager.debug_logs['reads_and_references'] = reads_and_references cases_line = f"Number of matching cases: {len(self.matching_cases_dict)}, " + \ - f"number of reads: {len(reads_and_references)}\n" + f"number of reads: {len(reads_and_references)}" output_manager.output_line({ "line": cases_line, From 1c4fe03df6441b8f23564addbd0e464c1501d909 Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 14:04:13 +0300 Subject: [PATCH 28/29] remove indel results being printed twice --- src/services/log_manager.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index 39a000e..caeafb9 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -126,9 +126,9 @@ def write_indel_results_to_file(self, indel_results): total_reads_in_indel_results += sum(value.values()) summary_line = "Indel results: count of reads in indel results: " + \ - f"{total_reads_in_indel_results}" + \ + f"{total_reads_in_indel_results} " + \ "(Note: one read can be related to multiple matching cases, " + \ - "or be related to multiple transcripts) \n" + "or be related to multiple transcripts). \n" with open(filepath, "w", encoding="utf-8") as file: file.writelines(results) @@ -149,7 +149,6 @@ def generate_indel_graphs(self, parser_args, indel_results): "found at given locations", "is_info": True }) - total_reads_in_indel_results = 0 graphs_created = 0 for key, value in indel_results.items(): if not self.validate_indel_grahp_should_be_created(parser_args, value): @@ -158,7 +157,7 @@ def generate_indel_graphs(self, parser_args, indel_results): f"offset: {key[3]}, n of cases: {sum(value.values())}" filename = "indel." 
+ str(key[0]) + ".strand_" + str(key[1]) + ".exon-loc-" + \ str(key[2]) + ".offset-(" + str(key[3]) + ")" - total_reads_in_indel_results += sum(value.values()) + graph_manager.construct_bar_chart_from_dict( graph_values=value, filename=filename, @@ -167,11 +166,7 @@ def generate_indel_graphs(self, parser_args, indel_results): y_label="Portion of reads", ) graphs_created += 1 - output_manager.output_line({ - "line": "Indel results: total number of reads in indel results: " + - f"{total_reads_in_indel_results}", - "is_info": True - }) + output_manager.output_line({ "line": f"Indel results: graphs for {graphs_created} cases created.", "is_info": True From 1929ebac693c66f4a4d0c91d18ebd7f093a1b04e Mon Sep 17 00:00:00 2001 From: Heidi Holappa Date: Thu, 6 Jul 2023 14:21:23 +0300 Subject: [PATCH 29/29] remove extra line-break --- src/services/log_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/services/log_manager.py b/src/services/log_manager.py index caeafb9..4a55a81 100644 --- a/src/services/log_manager.py +++ b/src/services/log_manager.py @@ -128,7 +128,7 @@ def write_indel_results_to_file(self, indel_results): summary_line = "Indel results: count of reads in indel results: " + \ f"{total_reads_in_indel_results} " + \ "(Note: one read can be related to multiple matching cases, " + \ - "or be related to multiple transcripts). \n" + "or be related to multiple transcripts)." with open(filepath, "w", encoding="utf-8") as file: file.writelines(results)