Commit 2944ba1

Merge pull request #169 from heidi-holappa/development

Merge to main

heidi-holappa authored Jul 6, 2023
2 parents 138bee4 + c31f8f8
Showing 17 changed files with 231 additions and 139 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -13,3 +13,4 @@ offsets.txt
 htmlcov/
 logs/
 dx1-logs/
+dx3-logs/
7 changes: 4 additions & 3 deletions arguments_template.json
@@ -4,9 +4,10 @@
     "reference_gtf": "<file.gtf>",
     "reference_fasta": "<file.fa>",
     "gffcompare_gtf": "<file.gtf>",
-    "offset": 0,
-    "class_code": "j c k",
-    "force": false,
+    "offset": [0, 6],
+    "class_code": "j c k s x",
+    "window_size": 8,
+    "min_reads_for_graph": 100,
+    "force": false,
     "extended_debugging": false
 }
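
For orientation: this template mirrors the CLI options (the parser below exposes a -j <json> flag), so a run can be configured from a file instead of individual switches. A minimal sketch of overlaying such a file onto a parsed namespace; the helper name and loading strategy are illustrative, not the project's exact code:

import json

def apply_json_arguments(parser_args, json_path):
    # Hypothetical helper: overlay values from an arguments JSON file
    # (shaped like the template above) onto an argparse namespace.
    with open(json_path, encoding="utf-8") as file:
        overrides = json.load(file)
    for key, value in overrides.items():
        setattr(parser_args, key, value)  # e.g. offset -> [0, 6]
    return parser_args
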
2 changes: 1 addition & 1 deletion src/compana.py
@@ -49,7 +49,7 @@ def run_pipeline(parser_args):
         parser_args.reads_bam,
         parser_args.reads_tsv,
         matching_cases_dict)
-    bam_manager.execute(parser_args.window_size)
+    bam_manager.execute(int(parser_args.window_size))

     log_manager.execute_log_file_creation(matching_cases_dict, parser_args)
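
Background for the int(...) cast above: argparse stores values supplied on the command line as strings unless the argument declares a type, so parser_args.window_size can arrive as "8". A small self-contained sketch of the behavior (not the project's code; type=int shown as the usual alternative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-w', '--window_size', default=8)
args = parser.parse_args(['-w', '8'])
assert args.window_size == '8'  # given on the CLI: a string, so callers must cast

typed = argparse.ArgumentParser()
typed.add_argument('-w', '--window_size', type=int, default=8)
assert typed.parse_args(['-w', '8']).window_size == 8  # converted by argparse
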
3 changes: 2 additions & 1 deletion src/config.py
@@ -10,7 +10,7 @@
     print("Warning! .env file not found.")

 temporary_dir_path = os.getenv("TEMPORARY_DIR") or "temporary_files"
-offset_log = os.getenv("OFFSET_LOG") or "offset_log.txt"
+offset_log = os.getenv("OFFSET_LOG_FILE") or "debug_offsets.log"
 test_file_directory = os.getenv(
     "TEST_FILE_DIRECTORY") or "src/tests/files"

@@ -30,6 +30,7 @@
 CIGAR_RESULTS_LOG = os.path.join(LOG_FILE_DIR, cigar_results)
 TITLE_FILE_LENGTH = os.getenv("TITLE_FILE_LENGTH") or 50
 DEFAULT_WINDOW_SIZE = os.getenv("DEFAULT_WINDOW_SIZE") or 8
+CREATE_IMG_N_TRESHOLD = os.getenv("CREATE_IMG_N_TRESHOLD") or 100

 if not os.path.exists(LOG_DIR):
     os.mkdir(LOG_DIR)
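
A quirk of the os.getenv(...) or <default> pattern used throughout this module: os.getenv returns a string (or None), so DEFAULT_WINDOW_SIZE is the string "8" when the environment variable is set but the integer 8 when it is not, which is why consumers cast with int(...), as the int(DEFAULT_WINDOW_SIZE) line removed from alignment_parser.py below did. A hedged sketch of a variant that pins the type at the definition site (not the project's code):

import os

# Casting here keeps each constant an int whether or not the variable is set.
DEFAULT_WINDOW_SIZE = int(os.getenv("DEFAULT_WINDOW_SIZE", "8"))
CREATE_IMG_N_TRESHOLD = int(os.getenv("CREATE_IMG_N_TRESHOLD", "100"))
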
27 changes: 10 additions & 17 deletions src/services/alignment_parser.py
@@ -1,10 +1,3 @@
-import pysam
-
-from services.output_manager import default_output_manager as output_manager
-from services.log_manager import default_log_manager as log_manager
-from config import DEFAULT_WINDOW_SIZE
-
-
 class AlignmentParser:

     def __init__(self):
@@ -13,9 +6,6 @@ def __init__(self):
         Follows the singleton pattern.
         """

-        # TODO: add to configuration or to user arguments (window size)
-        self.window_size = int(DEFAULT_WINDOW_SIZE)
-
     def extract_location_from_cigar_string(self,
                                            cigar_tuples: list,
                                            reference_start: int,
@@ -25,11 +15,12 @@ def extract_location_from_cigar_string(self,
         alignment_position = 0
         ref_position = 0

-        for idx, cigar_code in enumerate(cigar_tuples):
+        for cigar_code in cigar_tuples:

             if cigar_code[0] in [0, 2, 3, 7, 8]:
                 ref_position += cigar_code[1]
-                if ref_position <= relative_position and not reference_start + ref_position == reference_end:
+                if ref_position <= relative_position and not \
+                        reference_start + ref_position == reference_end:
                     alignment_position += cigar_code[1]
                 else:
                     return alignment_position + (cigar_code[1] - (ref_position - relative_position))
@@ -39,7 +30,8 @@ def count_indels_from_cigar_codes_in_given_window(self,

     def count_indels_from_cigar_codes_in_given_window(self,
                                                       cigar_tuples: list,
                                                       aligned_location: int,
-                                                      loc_type: str):
+                                                      loc_type: str,
+                                                      window_size: int):
         """
         Get cigar codes in a given window.
@@ -62,16 +54,17 @@ def count_indels_from_cigar_codes_in_given_window(self,
         debug_list = []

         if loc_type == "end":
-            aligned_location = aligned_location - self.window_size + 1
+            aligned_location = aligned_location - window_size + 1

         for cigar_code in cigar_tuples:
-            if self.window_size == len(cigar_code_list):
+            if window_size == len(cigar_code_list):
                 break
             if location + cigar_code[1] > aligned_location:
                 overlap = location + \
                     cigar_code[1] - (aligned_location + len(cigar_code_list))
                 cigar_code_list.extend(
-                    [cigar_code[0] for _ in range(min(self.window_size - len(cigar_code_list), overlap))])
+                    [cigar_code[0] for _ in range(min(window_size -
+                                                      len(cigar_code_list), overlap))])
             location += cigar_code[1]

         debug_list.append(cigar_code_list)
@@ -84,7 +77,7 @@ def count_indels_from_cigar_codes_in_given_window(self,
         result['deletions'] = deletions
         result['insertions'] = insertions

-        if deletions >= self.window_size or insertions >= self.window_size:
+        if deletions >= window_size or insertions >= window_size:
             debug_list.append(
                 f"deletions: {deletions}, insertions: {insertions}")
             return True, debug_list, result
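
For readers following the window logic: pysam exposes a read's CIGAR as cigartuples, a list of (operation, length) pairs with 0=M, 1=I (insertion), 2=D (deletion), 3=N, 7==, 8=X. A standalone sketch of the windowed counting idea above, simplified (no loc_type handling or debug output; names are illustrative):

def count_indels_in_window(cigar_tuples, window_start, window_size):
    # Expand operations into per-base codes, keeping only the bases that
    # fall inside [window_start, window_start + window_size).
    per_base_codes = []
    location = 0
    for op, length in cigar_tuples:
        if len(per_base_codes) == window_size:
            break
        if location + length > window_start:
            overlap = location + length - (window_start + len(per_base_codes))
            take = min(window_size - len(per_base_codes), overlap)
            per_base_codes.extend([op] * take)
        location += length
    return per_base_codes.count(1), per_base_codes.count(2)

# 5M 2I 3M with a 4-base window starting at position 4:
# the two inserted bases fall inside the window.
assert count_indels_in_window([(0, 5), (1, 2), (0, 3)], 4, 4) == (2, 0)
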
14 changes: 8 additions & 6 deletions src/services/argument_parser.py
@@ -2,12 +2,13 @@
 import json

 from config import DEFAULT_WINDOW_SIZE
+from config import CREATE_IMG_N_TRESHOLD


 def init_argparser():
     parser = argparse.ArgumentParser(
         description='compAna: a tool for comparing annotations',
-        usage='python3 compAna.py -i <input.gtf> [-f] [-s]'
+        usage='python3 src/compana.py -g <gffcompare_gtf> -r <reference_gtf> -a <reference_fasta> [-o <offset>] [-f] [-s] [-c <class_code>] [-j <json>] [-b <reads_bam>] [-t <reads_tsv>] [-w <window_size>] [-e]'
     )
     parser.add_argument(
         '-g', '--gffcompare_gtf',
@@ -56,11 +57,16 @@ def init_argparser():
     parser.add_argument(
         '-w', '--window_size',
         help='window from which indels and mismatches are to be searched in interesting locations',
-        metavar='')
+        metavar='',
+        default=DEFAULT_WINDOW_SIZE)
     parser.add_argument(
         '-e', '--extended_debug',
         help='enable extended debug output',
         action='store_true')
+    parser.add_argument(
+        '-m', '--min_reads_for_graph',
+        help='threshold for the n of cases for creating images',
+        default=CREATE_IMG_N_TRESHOLD)

     parser_args = parser.parse_args()
     parser_dict = vars(parser_args)
@@ -80,7 +86,6 @@ def init_argparser():
         parser_dict["offset"] = (0, float('inf'))
     else:
         offset_range = []
-        print(parser_args.offset)
         for element in parser_args.offset:
             if isinstance(element, str) and len(element) == 3:
                 offset_range.append(float('inf'))
@@ -90,7 +95,4 @@ def init_argparser():
             max(0, offset_range[0]), max(0, offset_range[1])
         )

-    if not parser_args.window_size:
-        parser_dict["window_size"] = DEFAULT_WINDOW_SIZE
-
     return parser_args
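
The offset branch above maps the two-element -o value onto a numeric range, treating the three-character string "inf" as an open upper bound. A hedged sketch of that convention in isolation (hypothetical helper; the non-"inf" branch is assumed to cast to int, which the elided lines above do not show):

def parse_offset_range(raw_offsets):
    bounds = []
    for element in raw_offsets:
        if isinstance(element, str) and len(element) == 3:  # the "inf" sentinel
            bounds.append(float('inf'))
        else:
            bounds.append(int(element))  # assumption: numeric values
    return (max(0, bounds[0]), max(0, bounds[1]))

assert parse_offset_range([0, 6]) == (0, 6)
assert parse_offset_range([3, "inf"]) == (3, float('inf'))
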
40 changes: 19 additions & 21 deletions src/services/bam_manager.py
@@ -1,32 +1,31 @@
 import os
 import pysam

 from services.read_management import create_dict_of_transcripts_and_reads
 from services.output_manager import default_output_manager as output_manager
 from services.alignment_parser import default_alignment_parser as alignment_parser
 from services.log_manager import default_log_manager as log_manager

 from config import LOG_FILE_DIR


 class BamManager:

     def __init__(self,
                  bam_path: str,
                  tsv_path: str,
                  matching_cases_dict: dict):
+        # pylint: disable=no-member

         self.bam_path = bam_path
         self.tsv_path = tsv_path
+        self.samfile = pysam.AlignmentFile(self.bam_path, "rb")

         self.matching_cases_dict = matching_cases_dict
         self.reads_and_transcripts = {}

-    def initialize_file(self, filename: str):
-        # pylint: disable=no-member
-        self.samfile = pysam.AlignmentFile(filename, "rb")
-
-    def process_bam_file(self, reads_and_references: dict):
+    def process_bam_file(self, reads_and_references: dict, window_size: int):
+        output_manager.output_line({
+            "line": "Iterating reads and counting indels. This may take a while.",
+            "is_info": True
+        })
         count = 0
         errors = []
         set_of_processed_reads = set()
@@ -61,7 +60,8 @@ def process_bam_file(self, reads_and_references: dict):

             idx_corrected_location = location - 1

-            if read.reference_start > idx_corrected_location or read.reference_end < idx_corrected_location:
+            if read.reference_start > idx_corrected_location \
+                    or read.reference_end < idx_corrected_location:
                 errors.append(
                     f"Non-matching location: {read.query_name}, {matching_case_key}\t")
                 continue
@@ -83,7 +83,8 @@ def process_bam_file(self, reads_and_references: dict):
             error, debug_list, result = alignment_parser.count_indels_from_cigar_codes_in_given_window(
                 read.cigartuples,
                 aligned_location,
-                loc_type)
+                loc_type,
+                window_size)
             for key, value in result.items():
                 if value not in self.matching_cases_dict[matching_case_key]['indel_errors'][key]:
                     self.matching_cases_dict[matching_case_key]['indel_errors'][key][value] = 0
@@ -99,17 +100,17 @@ def process_bam_file(self, reads_and_references: dict):
             output_manager.output_line({
                 "line": str(prev_processed_reads_counter) +
                 " iterations extracted an already processed read from the BAM-file",
-                "is_error": True,
+                "is_warning": True,
             })

         output_manager.output_line({
-            "line": "\nFinished",
+            "line": "Processing BAM-file finished.",
             "is_info": True
         })

     def output_heading_information(self):
         output_manager.output_line({
-            "line": "PROCESSING BAM-FILE",
+            "line": "ITERATE READS",
             "is_title": True
         })
         output_manager.output_line({
@@ -156,18 +157,16 @@ def generate_dictionaries(self):
         log_manager.debug_logs['transcripts_and_reads'] = transcripts_and_reads
         log_manager.debug_logs['reads_and_references'] = reads_and_references

-        output_manager.output_line({
-            "line": "NUMBER OF MATCHING CASES:" + str(len(self.matching_cases_dict)),
-            "is_info": True
-        })
+        cases_line = f"Number of matching cases: {len(self.matching_cases_dict)}, " + \
+            f"number of reads: {len(reads_and_references)}"

         output_manager.output_line({
-            "line": "NUMBER OF READS: " + str(len(reads_and_references)),
+            "line": cases_line,
             "is_info": True
         })

         output_manager.output_line({
-            "line": "Analyzing offset of reads. This may take a while.",
+            "line": "Note: one read may be related to multiple matching cases.",
             "is_info": True
         })

@@ -179,5 +178,4 @@ def execute(self, window_size: int):

         reads_and_references = self.generate_dictionaries()

-        self.initialize_file(self.bam_path)
-        self.process_bam_file(reads_and_references)
+        self.process_bam_file(reads_and_references, window_size)
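
For context on the pysam calls above: AlignmentFile opens a BAM, and iterating it (or a fetch region, which requires an index) yields AlignedSegment objects whose query_name, reference_start/reference_end and cigartuples drive the checks in process_bam_file. A minimal hedged sketch with illustrative paths and coordinates:

import pysam

# pylint: disable=no-member  # pysam builds these attributes at runtime
samfile = pysam.AlignmentFile("reads.bam", "rb")  # hypothetical path

for read in samfile.fetch("chr1", 10_000, 10_100):  # region fetch needs a .bai index
    if read.is_unmapped or read.cigartuples is None:
        continue  # skip reads that carry no alignment information
    print(read.query_name, read.reference_start, read.reference_end)

samfile.close()
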
6 changes: 4 additions & 2 deletions src/services/db_initializer.py
@@ -32,13 +32,15 @@ def init_databases(gffcompare_gtf: str, reference_gtf: str, force=False) -> tuple:

         if not force and db_exists:
             output_manager.output_line({
-                "line": f"{key}: database file already exists. Using existing database. Use -f to force overwriting existing db-files.",
+                "line": f"{key}: database file already exists. Using existing database. " +
+                "Use -f to force overwriting existing db-files.",
                 "is_info": True
             })

         else:
             output_manager.output_line({
-                "line": f"{key}: database file does not exist. Creating database. This might take a while.",
+                "line": f"{key}: database file does not exist. " +
+                "Creating database. This might take a while.",
                 "is_info": True
             })
             gffutils.create_db(
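
For context, gffutils.create_db parses a GTF once into a sqlite database that later steps query (reference_db.children(...) in the next file). A hedged sketch of typical usage; filenames are illustrative, and the disable_infer_* flags follow common gffutils GTF recipes rather than this project's exact call, which the diff elides:

import gffutils

db = gffutils.create_db(
    "reference.gtf",           # input annotation (hypothetical path)
    dbfn="reference.db",       # sqlite file to create
    force=True,                # overwrite an existing db file, like -f above
    keep_order=True,
    disable_infer_genes=True,  # skip inference when gene/transcript lines exist
    disable_infer_transcripts=True,
)

# Reopen later without re-parsing the GTF; the transcript ID is illustrative.
db = gffutils.FeatureDB("reference.db")
for exon in db.children("ENST00000456328", featuretype="exon", order_by="start"):
    print(exon.id, exon.start, exon.end)
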
56 changes: 37 additions & 19 deletions src/services/extract_matching_cases.py
@@ -1,8 +1,13 @@
+from services.output_manager import default_output_manager as output_manager
+
+
 class MatchingCasesExtractor:

     def __init__(self, offset_results: dict, offset_range: tuple, reference_db):
         """
-        Extract candidates matching the selected offset.
+        Extract candidates matching the selected offset. Currently first and last
+        exon are not considered. These are omitted with the magic numbers in the
+        second for loop.

         Args:
             offset_results (dict): a dictionary containing the offset results for each transcript
@@ -14,42 +19,55 @@ def __init__(self, offset_results: dict, offset_range: tuple, reference_db):
         self.reference_db = reference_db

     def extract_candidates_matching_selected_offset(self) -> dict:
+        output_manager.output_line({
+            "line": "EXTRACTING CASES",
+            "is_title": True
+        })
+        output_manager.output_line({
+            "line": f"Extracting candidates in the given offset range {self.offset_range}",
+            "is_info": True
+        })
         extracted_candidates = {}
-        for key, value in self.offset_results.items():
-            strand, offsets = value["strand"], value["offsets"]
-            reference_id, transcript_id = value["reference_id"], key
-            if strand == "-":
-                offsets.reverse()
+        for transcript_id, value in self.offset_results.items():
+            strand, offsets, reference_id = value["strand"], value["offsets"], value["reference_id"]
             for offset_exon_idx in range(1, len(offsets)-1):
-                offset_exon_number = offset_exon_idx + 1
-                if abs(offsets[offset_exon_idx][0]) >= self.offset_range[0] and \
-                        abs(offsets[offset_exon_idx][0]) <= self.offset_range[1]:
+                start_is_in_range = bool(
+                    abs(offsets[offset_exon_idx][0]) >= self.offset_range[0] and
+                    abs(offsets[offset_exon_idx][0]) <= self.offset_range[1])
+                if start_is_in_range:
                     entry_key = transcript_id + ".exon_" + \
-                        str(offset_exon_number) + ".start" + \
+                        str(offset_exon_idx) + ".start" + \
                         '.offset_' + str(offsets[offset_exon_idx][0])
-                    for exon in self.reference_db.children(reference_id, featuretype='exon'):
-                        if int(exon['exon_number'][0]) == offset_exon_number:
+                    for exon_number, exon in enumerate(self.reference_db.children(
+                            reference_id, featuretype='exon', order_by='start')):
+                        if exon_number == offset_exon_idx:
                             extracted_candidates[entry_key] = {
                                 "transcript_id": transcript_id,
                                 "strand": strand,
-                                "exon_number": offset_exon_number,
+                                "exon_number": offset_exon_idx,
                                 "location_type": "start",
                                 "location": exon.start + offsets[offset_exon_idx][0],
                                 "offset": offsets[offset_exon_idx][0]
                             }
-                if abs(offsets[offset_exon_idx][1]) >= self.offset_range[0] and \
-                        abs(offsets[offset_exon_idx][1]) <= self.offset_range[1]:
+                end_is_in_range = bool(abs(offsets[offset_exon_idx][1]) >= self.offset_range[0] and
+                                       abs(offsets[offset_exon_idx][1]) <= self.offset_range[1])
+                if end_is_in_range:
                     entry_key = transcript_id + ".exon_" + \
-                        str(offset_exon_number) + ".end" + \
+                        str(offset_exon_idx) + ".end" + \
                         '.offset_' + str(offsets[offset_exon_idx][1])
-                    for exon in self.reference_db.children(reference_id, featuretype='exon'):
-                        if int(exon['exon_number'][0]) == offset_exon_number:
+                    for exon_number, exon in enumerate(self.reference_db.children(
+                            reference_id, featuretype='exon', order_by='start')):
+                        if exon_number == offset_exon_idx:
                             extracted_candidates[entry_key] = {
                                 "transcript_id": transcript_id,
                                 "strand": strand,
-                                "exon_number": offset_exon_number,
+                                "exon_number": offset_exon_idx,
                                 "location_type": "end",
                                 "location": exon.end + offsets[offset_exon_idx][1],
                                 "offset": offsets[offset_exon_idx][1]
                             }
+        output_manager.output_line({
+            "line": f"Found {len(extracted_candidates)} candidates in the given offset range",
+            "is_info": True
+        })
         return extracted_candidates
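
The central change above replaces the GTF's exon_number attribute with a positional index over exons returned in order_by='start' order. A small sketch of the indexing idea on plain tuples (illustrative data; note that on the minus strand the leftmost exon by coordinate is the last exon in transcript order, so strand still matters to callers):

# Exons as (start, end) pairs, deliberately unsorted.
exons = [(300, 399), (100, 199), (500, 599)]

# enumerate over coordinate-sorted exons: index 0 is always the leftmost
# exon on the reference, independent of GTF attributes.
for exon_number, (start, end) in enumerate(sorted(exons)):
    print(exon_number, start, end)  # prints 0 100 199, then 1 300 399, then 2 500 599
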