Commit 2944ba1

Merge pull request #169 from heidi-holappa/development

Merge to main

heidi-holappa authored Jul 6, 2023
2 parents 138bee4 + c31f8f8
Showing 17 changed files with 231 additions and 139 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -13,3 +13,4 @@ offsets.txt
 htmlcov/
 logs/
 dx1-logs/
+dx3-logs/
7 changes: 4 additions & 3 deletions arguments_template.json
@@ -4,9 +4,10 @@
     "reference_gtf": "<file.gtf>",
     "reference_fasta": "<file.fa>",
     "gffcompare_gtf": "<file.gtf>",
-    "offset": 0,
-    "class_code": "j c k",
-    "force": false,
+    "offset": [0, 6],
+    "class_code": "j c k s x",
+    "window_size": 8,
+    "min_reads_for_graph": 100,
+    "force": false,
     "extended_debugging": false
 }
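
For orientation: this template mirrors the CLI options (the parser below exposes a -j <json> flag), so a run can be configured from a file instead of individual switches. A minimal sketch of overlaying such a file onto a parsed namespace; the helper name and loading strategy are illustrative, not the project's exact code:

import json

def apply_json_arguments(parser_args, json_path):
    # Hypothetical helper: overlay values from an arguments JSON file
    # (shaped like the template above) onto an argparse namespace.
    with open(json_path, encoding="utf-8") as file:
        overrides = json.load(file)
    for key, value in overrides.items():
        setattr(parser_args, key, value)  # e.g. offset -> [0, 6]
    return parser_args
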
2 changes: 1 addition & 1 deletion src/compana.py
@@ -49,7 +49,7 @@ def run_pipeline(parser_args):
         parser_args.reads_bam,
         parser_args.reads_tsv,
         matching_cases_dict)
-    bam_manager.execute(parser_args.window_size)
+    bam_manager.execute(int(parser_args.window_size))

     log_manager.execute_log_file_creation(matching_cases_dict, parser_args)
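
Background for the int(...) cast above: argparse stores values supplied on the command line as strings unless the argument declares a type, so parser_args.window_size can arrive as "8". A small self-contained sketch of the behavior (not the project's code; type=int shown as the usual alternative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-w', '--window_size', default=8)
args = parser.parse_args(['-w', '8'])
assert args.window_size == '8'  # given on the CLI: a string, so callers must cast

typed = argparse.ArgumentParser()
typed.add_argument('-w', '--window_size', type=int, default=8)
assert typed.parse_args(['-w', '8']).window_size == 8  # converted by argparse
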
3 changes: 2 additions & 1 deletion src/config.py
@@ -10,7 +10,7 @@
     print("Warning! .env file not found.")

 temporary_dir_path = os.getenv("TEMPORARY_DIR") or "temporary_files"
-offset_log = os.getenv("OFFSET_LOG") or "offset_log.txt"
+offset_log = os.getenv("OFFSET_LOG_FILE") or "debug_offsets.log"
 test_file_directory = os.getenv(
     "TEST_FILE_DIRECTORY") or "src/tests/files"

@@ -30,6 +30,7 @@
 CIGAR_RESULTS_LOG = os.path.join(LOG_FILE_DIR, cigar_results)
 TITLE_FILE_LENGTH = os.getenv("TITLE_FILE_LENGTH") or 50
 DEFAULT_WINDOW_SIZE = os.getenv("DEFAULT_WINDOW_SIZE") or 8
+CREATE_IMG_N_TRESHOLD = os.getenv("CREATE_IMG_N_TRESHOLD") or 100

 if not os.path.exists(LOG_DIR):
     os.mkdir(LOG_DIR)
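
A quirk of the os.getenv(...) or <default> pattern used throughout this module: os.getenv returns a string (or None), so DEFAULT_WINDOW_SIZE is the string "8" when the environment variable is set but the integer 8 when it is not, which is why consumers cast with int(...), as the int(DEFAULT_WINDOW_SIZE) line removed from alignment_parser.py below did. A hedged sketch of a variant that pins the type at the definition site (not the project's code):

import os

# Casting here keeps each constant an int whether or not the variable is set.
DEFAULT_WINDOW_SIZE = int(os.getenv("DEFAULT_WINDOW_SIZE", "8"))
CREATE_IMG_N_TRESHOLD = int(os.getenv("CREATE_IMG_N_TRESHOLD", "100"))
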
27 changes: 10 additions & 17 deletions src/services/alignment_parser.py
@@ -1,10 +1,3 @@
-import pysam
-
-from services.output_manager import default_output_manager as output_manager
-from services.log_manager import default_log_manager as log_manager
-from config import DEFAULT_WINDOW_SIZE
-
-
 class AlignmentParser:

     def __init__(self):
@@ -13,9 +6,6 @@ def __init__(self):
         Follows the singleton pattern.
         """

-        # TODO: add to configuration or to user arguments (window size)
-        self.window_size = int(DEFAULT_WINDOW_SIZE)
-
     def extract_location_from_cigar_string(self,
                                            cigar_tuples: list,
                                            reference_start: int,
@@ -25,11 +15,12 @@ def extract_location_from_cigar_string(self,
         alignment_position = 0
         ref_position = 0

-        for idx, cigar_code in enumerate(cigar_tuples):
+        for cigar_code in cigar_tuples:

             if cigar_code[0] in [0, 2, 3, 7, 8]:
                 ref_position += cigar_code[1]
-                if ref_position <= relative_position and not reference_start + ref_position == reference_end:
+                if ref_position <= relative_position and not \
+                        reference_start + ref_position == reference_end:
                     alignment_position += cigar_code[1]
                 else:
                     return alignment_position + (cigar_code[1] - (ref_position - relative_position))
@@ -39,7 +30,8 @@ def count_indels_from_cigar_codes_in_given_window(self,

     def count_indels_from_cigar_codes_in_given_window(self,
                                                       cigar_tuples: list,
                                                       aligned_location: int,
-                                                      loc_type: str):
+                                                      loc_type: str,
+                                                      window_size: int):
         """
         Get cigar codes in a given window.
@@ -62,16 +54,17 @@ def count_indels_from_cigar_codes_in_given_window(self,
         debug_list = []

         if loc_type == "end":
-            aligned_location = aligned_location - self.window_size + 1
+            aligned_location = aligned_location - window_size + 1

         for cigar_code in cigar_tuples:
-            if self.window_size == len(cigar_code_list):
+            if window_size == len(cigar_code_list):
                 break
             if location + cigar_code[1] > aligned_location:
                 overlap = location + \
                     cigar_code[1] - (aligned_location + len(cigar_code_list))
                 cigar_code_list.extend(
-                    [cigar_code[0] for _ in range(min(self.window_size - len(cigar_code_list), overlap))])
+                    [cigar_code[0] for _ in range(min(window_size -
+                                                      len(cigar_code_list), overlap))])
             location += cigar_code[1]

         debug_list.append(cigar_code_list)
@@ -84,7 +77,7 @@ def count_indels_from_cigar_codes_in_given_window(self,
         result['deletions'] = deletions
         result['insertions'] = insertions

-        if deletions >= self.window_size or insertions >= self.window_size:
+        if deletions >= window_size or insertions >= window_size:
             debug_list.append(
                 f"deletions: {deletions}, insertions: {insertions}")
             return True, debug_list, result
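
For readers following the window logic: pysam exposes a read's CIGAR as cigartuples, a list of (operation, length) pairs with 0=M, 1=I (insertion), 2=D (deletion), 3=N, 7==, 8=X. A standalone sketch of the windowed counting idea above, simplified (no loc_type handling or debug output; names are illustrative):

def count_indels_in_window(cigar_tuples, window_start, window_size):
    # Expand operations into per-base codes, keeping only the bases that
    # fall inside [window_start, window_start + window_size).
    per_base_codes = []
    location = 0
    for op, length in cigar_tuples:
        if len(per_base_codes) == window_size:
            break
        if location + length > window_start:
            overlap = location + length - (window_start + len(per_base_codes))
            take = min(window_size - len(per_base_codes), overlap)
            per_base_codes.extend([op] * take)
        location += length
    return per_base_codes.count(1), per_base_codes.count(2)

# 5M 2I 3M with a 4-base window starting at position 4:
# the two inserted bases fall inside the window.
assert count_indels_in_window([(0, 5), (1, 2), (0, 3)], 4, 4) == (2, 0)
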
14 changes: 8 additions & 6 deletions src/services/argument_parser.py
@@ -2,12 +2,13 @@
 import json

 from config import DEFAULT_WINDOW_SIZE
+from config import CREATE_IMG_N_TRESHOLD


 def init_argparser():
     parser = argparse.ArgumentParser(
         description='compAna: a tool for comparing annotations',
-        usage='python3 compAna.py -i <input.gtf> [-f] [-s]'
+        usage='python3 src/compana.py -g <gffcompare_gtf> -r <reference_gtf> -a <reference_fasta> [-o <offset>] [-f] [-s] [-c <class_code>] [-j <json>] [-b <reads_bam>] [-t <reads_tsv>] [-w <window_size>] [-e]'
     )
     parser.add_argument(
         '-g', '--gffcompare_gtf',
@@ -56,11 +57,16 @@ def init_argparser():
     parser.add_argument(
         '-w', '--window_size',
         help='window from which indels and mismatches are to be searched in interesting locations',
-        metavar='')
+        metavar='',
+        default=DEFAULT_WINDOW_SIZE)
     parser.add_argument(
         '-e', '--extended_debug',
         help='enable extended debug output',
         action='store_true')
+    parser.add_argument(
+        '-m', '--min_reads_for_graph',
+        help='threshold for the n of cases for creating images',
+        default=CREATE_IMG_N_TRESHOLD)

     parser_args = parser.parse_args()
     parser_dict = vars(parser_args)
@@ -80,7 +86,6 @@ def init_argparser():
         parser_dict["offset"] = (0, float('inf'))
     else:
         offset_range = []
-        print(parser_args.offset)
         for element in parser_args.offset:
             if isinstance(element, str) and len(element) == 3:
                 offset_range.append(float('inf'))
@@ -90,7 +95,4 @@ def init_argparser():
             max(0, offset_range[0]), max(0, offset_range[1])
         )

-    if not parser_args.window_size:
-        parser_dict["window_size"] = DEFAULT_WINDOW_SIZE
-
     return parser_args
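
The offset branch above maps the two-element -o value onto a numeric range, treating the three-character string "inf" as an open upper bound. A hedged sketch of that convention in isolation (hypothetical helper; the non-"inf" branch is assumed to cast to int, which the elided lines above do not show):

def parse_offset_range(raw_offsets):
    bounds = []
    for element in raw_offsets:
        if isinstance(element, str) and len(element) == 3:  # the "inf" sentinel
            bounds.append(float('inf'))
        else:
            bounds.append(int(element))  # assumption: numeric values
    return (max(0, bounds[0]), max(0, bounds[1]))

assert parse_offset_range([0, 6]) == (0, 6)
assert parse_offset_range([3, "inf"]) == (3, float('inf'))
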
40 changes: 19 additions & 21 deletions src/services/bam_manager.py
@@ -1,32 +1,31 @@
 import os
 import pysam

 from services.read_management import create_dict_of_transcripts_and_reads
 from services.output_manager import default_output_manager as output_manager
 from services.alignment_parser import default_alignment_parser as alignment_parser
 from services.log_manager import default_log_manager as log_manager

 from config import LOG_FILE_DIR


 class BamManager:

     def __init__(self,
                  bam_path: str,
                  tsv_path: str,
                  matching_cases_dict: dict):
+        # pylint: disable=no-member

         self.bam_path = bam_path
         self.tsv_path = tsv_path
+        self.samfile = pysam.AlignmentFile(self.bam_path, "rb")

         self.matching_cases_dict = matching_cases_dict
         self.reads_and_transcripts = {}

-    def initialize_file(self, filename: str):
-        # pylint: disable=no-member
-        self.samfile = pysam.AlignmentFile(filename, "rb")
-
-    def process_bam_file(self, reads_and_references: dict):
+    def process_bam_file(self, reads_and_references: dict, window_size: int):
+        output_manager.output_line({
+            "line": "Iterating reads and counting indels. This may take a while.",
+            "is_info": True
+        })
         count = 0
         errors = []
         set_of_processed_reads = set()
@@ -61,7 +60,8 @@ def process_bam_file(self, reads_and_references: dict):

             idx_corrected_location = location - 1

-            if read.reference_start > idx_corrected_location or read.reference_end < idx_corrected_location:
+            if read.reference_start > idx_corrected_location \
+                    or read.reference_end < idx_corrected_location:
                 errors.append(
                     f"Non-matching location: {read.query_name}, {matching_case_key}\t")
                 continue
@@ -83,7 +83,8 @@ def process_bam_file(self, reads_and_references: dict):
             error, debug_list, result = alignment_parser.count_indels_from_cigar_codes_in_given_window(
                 read.cigartuples,
                 aligned_location,
-                loc_type)
+                loc_type,
+                window_size)
             for key, value in result.items():
                 if value not in self.matching_cases_dict[matching_case_key]['indel_errors'][key]:
                     self.matching_cases_dict[matching_case_key]['indel_errors'][key][value] = 0
@@ -99,17 +100,17 @@ def process_bam_file(self, reads_and_references: dict):
             output_manager.output_line({
                 "line": str(prev_processed_reads_counter) +
                 " iterations extracted an already processed read from the BAM-file",
-                "is_error": True,
+                "is_warning": True,
             })

         output_manager.output_line({
-            "line": "\nFinished",
+            "line": "Processing BAM-file finished.",
             "is_info": True
         })

     def output_heading_information(self):
         output_manager.output_line({
-            "line": "PROCESSING BAM-FILE",
+            "line": "ITERATE READS",
             "is_title": True
         })
         output_manager.output_line({
@@ -156,18 +157,16 @@ def generate_dictionaries(self):
         log_manager.debug_logs['transcripts_and_reads'] = transcripts_and_reads
         log_manager.debug_logs['reads_and_references'] = reads_and_references

-        output_manager.output_line({
-            "line": "NUMBER OF MATCHING CASES:" + str(len(self.matching_cases_dict)),
-            "is_info": True
-        })
+        cases_line = f"Number of matching cases: {len(self.matching_cases_dict)}, " + \
+            f"number of reads: {len(reads_and_references)}"

         output_manager.output_line({
-            "line": "NUMBER OF READS: " + str(len(reads_and_references)),
+            "line": cases_line,
             "is_info": True
         })

         output_manager.output_line({
-            "line": "Analyzing offset of reads. This may take a while.",
+            "line": "Note: one read may be related to multiple matching cases.",
             "is_info": True
         })

@@ -179,5 +178,4 @@ def execute(self, window_size: int):

         reads_and_references = self.generate_dictionaries()

-        self.initialize_file(self.bam_path)
-        self.process_bam_file(reads_and_references)
+        self.process_bam_file(reads_and_references, window_size)
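
For context on the pysam calls above: AlignmentFile opens a BAM, and iterating it (or a fetch region, which requires an index) yields AlignedSegment objects whose query_name, reference_start/reference_end and cigartuples drive the checks in process_bam_file. A minimal hedged sketch with illustrative paths and coordinates:

import pysam

# pylint: disable=no-member  # pysam builds these attributes at runtime
samfile = pysam.AlignmentFile("reads.bam", "rb")  # hypothetical path

for read in samfile.fetch("chr1", 10_000, 10_100):  # region fetch needs a .bai index
    if read.is_unmapped or read.cigartuples is None:
        continue  # skip reads that carry no alignment information
    print(read.query_name, read.reference_start, read.reference_end)

samfile.close()
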
6 changes: 4 additions & 2 deletions src/services/db_initializer.py
@@ -32,13 +32,15 @@ def init_databases(gffcompare_gtf: str, reference_gtf: str, force=False) -> tuple:

         if not force and db_exists:
             output_manager.output_line({
-                "line": f"{key}: database file already exists. Using existing database. Use -f to force overwriting existing db-files.",
+                "line": f"{key}: database file already exists. Using existing database. " +
+                "Use -f to force overwriting existing db-files.",
                 "is_info": True
             })

         else:
             output_manager.output_line({
-                "line": f"{key}: database file does not exist. Creating database. This might take a while.",
+                "line": f"{key}: database file does not exist. " +
+                "Creating database. This might take a while.",
                 "is_info": True
             })
             gffutils.create_db(
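
For context, gffutils.create_db parses a GTF once into a sqlite database that later steps query (reference_db.children(...) in the next file). A hedged sketch of typical usage; filenames are illustrative, and the disable_infer_* flags follow common gffutils GTF recipes rather than this project's exact call, which the diff elides:

import gffutils

db = gffutils.create_db(
    "reference.gtf",           # input annotation (hypothetical path)
    dbfn="reference.db",       # sqlite file to create
    force=True,                # overwrite an existing db file, like -f above
    keep_order=True,
    disable_infer_genes=True,  # skip inference when gene/transcript lines exist
    disable_infer_transcripts=True,
)

# Reopen later without re-parsing the GTF; the transcript ID is illustrative.
db = gffutils.FeatureDB("reference.db")
for exon in db.children("ENST00000456328", featuretype="exon", order_by="start"):
    print(exon.id, exon.start, exon.end)
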
56 changes: 37 additions & 19 deletions src/services/extract_matching_cases.py
@@ -1,8 +1,13 @@
+from services.output_manager import default_output_manager as output_manager
+
+
 class MatchingCasesExtractor:

     def __init__(self, offset_results: dict, offset_range: tuple, reference_db):
         """
-        Extract candidates matching the selected offset.
+        Extract candidates matching the selected offset. Currently first and last
+        exon are not considered. These are omitted with the magic numbers in the
+        second for loop.

         Args:
             offset_results (dict): a dictionary containing the offset results for each transcript
@@ -14,42 +19,55 @@ def __init__(self, offset_results: dict, offset_range: tuple, reference_db):
         self.reference_db = reference_db

     def extract_candidates_matching_selected_offset(self) -> dict:
+        output_manager.output_line({
+            "line": "EXTRACTING CASES",
+            "is_title": True
+        })
+        output_manager.output_line({
+            "line": f"Extracting candidates in the given offset range {self.offset_range}",
+            "is_info": True
+        })
         extracted_candidates = {}
-        for key, value in self.offset_results.items():
-            strand, offsets = value["strand"], value["offsets"]
-            reference_id, transcript_id = value["reference_id"], key
-            if strand == "-":
-                offsets.reverse()
+        for transcript_id, value in self.offset_results.items():
+            strand, offsets, reference_id = value["strand"], value["offsets"], value["reference_id"]
             for offset_exon_idx in range(1, len(offsets)-1):
-                offset_exon_number = offset_exon_idx + 1
-                if abs(offsets[offset_exon_idx][0]) >= self.offset_range[0] and \
-                        abs(offsets[offset_exon_idx][0]) <= self.offset_range[1]:
+                start_is_in_range = bool(
+                    abs(offsets[offset_exon_idx][0]) >= self.offset_range[0] and
+                    abs(offsets[offset_exon_idx][0]) <= self.offset_range[1])
+                if start_is_in_range:
                     entry_key = transcript_id + ".exon_" + \
-                        str(offset_exon_number) + ".start" + \
+                        str(offset_exon_idx) + ".start" + \
                         '.offset_' + str(offsets[offset_exon_idx][0])
-                    for exon in self.reference_db.children(reference_id, featuretype='exon'):
-                        if int(exon['exon_number'][0]) == offset_exon_number:
+                    for exon_number, exon in enumerate(self.reference_db.children(
+                            reference_id, featuretype='exon', order_by='start')):
+                        if exon_number == offset_exon_idx:
                             extracted_candidates[entry_key] = {
                                 "transcript_id": transcript_id,
                                 "strand": strand,
-                                "exon_number": offset_exon_number,
+                                "exon_number": offset_exon_idx,
                                 "location_type": "start",
                                 "location": exon.start + offsets[offset_exon_idx][0],
                                 "offset": offsets[offset_exon_idx][0]
                             }
-                if abs(offsets[offset_exon_idx][1]) >= self.offset_range[0] and \
-                        abs(offsets[offset_exon_idx][1]) <= self.offset_range[1]:
+                end_is_in_range = bool(abs(offsets[offset_exon_idx][1]) >= self.offset_range[0] and
+                                       abs(offsets[offset_exon_idx][1]) <= self.offset_range[1])
+                if end_is_in_range:
                     entry_key = transcript_id + ".exon_" + \
-                        str(offset_exon_number) + ".end" + \
+                        str(offset_exon_idx) + ".end" + \
                         '.offset_' + str(offsets[offset_exon_idx][1])
-                    for exon in self.reference_db.children(reference_id, featuretype='exon'):
-                        if int(exon['exon_number'][0]) == offset_exon_number:
+                    for exon_number, exon in enumerate(self.reference_db.children(
+                            reference_id, featuretype='exon', order_by='start')):
+                        if exon_number == offset_exon_idx:
                             extracted_candidates[entry_key] = {
                                 "transcript_id": transcript_id,
                                 "strand": strand,
-                                "exon_number": offset_exon_number,
+                                "exon_number": offset_exon_idx,
                                 "location_type": "end",
                                 "location": exon.end + offsets[offset_exon_idx][1],
                                 "offset": offsets[offset_exon_idx][1]
                             }
+        output_manager.output_line({
+            "line": f"Found {len(extracted_candidates)} candidates in the given offset range",
+            "is_info": True
+        })
         return extracted_candidates
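
The central change above replaces the GTF's exon_number attribute with a positional index over exons returned in order_by='start' order. A small sketch of the indexing idea on plain tuples (illustrative data; note that on the minus strand the leftmost exon by coordinate is the last exon in transcript order, so strand still matters to callers):

# Exons as (start, end) pairs, deliberately unsorted.
exons = [(300, 399), (100, 199), (500, 599)]

# enumerate over coordinate-sorted exons: index 0 is always the leftmost
# exon on the reference, independent of GTF attributes.
for exon_number, (start, end) in enumerate(sorted(exons)):
    print(exon_number, start, end)  # prints 0 100 199, then 1 300 399, then 2 500 599
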