From 9d316637325faa8bab8219410190b5e7caab6c62 Mon Sep 17 00:00:00 2001 From: Susanna Kiwala Date: Tue, 7 Jan 2025 10:14:34 -0600 Subject: [PATCH] Prevent pVACvector from clipping Best Peptide --- pvactools/lib/fasta_generator.py | 38 +++-- .../tools/pvacseq/generate_protein_fasta.py | 57 ++++---- .../input.aggregated.tsv | 48 +++---- .../output_with_aggregated_tsv.fasta | 136 +++++++++--------- ...tor.prevent_clipping_best_peptide.input.fa | 16 +++ tests/test_output_parser.py | 3 - tests/test_pvacvector.py | 38 +++++ 7 files changed, 198 insertions(+), 138 deletions(-) create mode 100644 tests/test_data/pvacvector/Test.vector.prevent_clipping_best_peptide.input.fa diff --git a/pvactools/lib/fasta_generator.py b/pvactools/lib/fasta_generator.py index cc0a78b4c..81ded11b3 100644 --- a/pvactools/lib/fasta_generator.py +++ b/pvactools/lib/fasta_generator.py @@ -7,6 +7,8 @@ from abc import ABCMeta from Bio import SeqIO import itertools +import logging + from pvactools.lib.proximal_variant import ProximalVariant csv.field_size_limit(sys.maxsize) @@ -364,13 +366,21 @@ def __init__(self, **kwargs): def execute(self): seq_dict = dict() + best_peptide = dict() for record in SeqIO.parse(self.input_file, "fasta"): seq_dict[record.id] = str(record.seq) + description = record.description.replace("{} ".format(record.id), "") + if description != "": + try: + best_peptide[record.id] = json.loads(description)['Best Peptide'] + except: + pass for length in self.epitope_lengths: epitopes = dict() fasta_sequences = OrderedDict() wingspan_length = length - 1 + warnings = set() for (seq1, seq2) in self.junctions_to_test: seq1_seq = seq_dict[seq1] seq2_seq = seq_dict[seq2] @@ -379,6 +389,19 @@ def execute(self): #These combinations would've already been tested in previous attempts with lower clip lengths and can be skipped if left_clip_length < self.clip_length and right_clip_length < self.clip_length: continue + if seq1 in best_peptide: + seq1_best_peptide = best_peptide[seq1] + last_position = seq1_seq.rindex(seq1_best_peptide) + len(seq1_best_peptide) + end_distance = len(seq1_seq) - last_position + if left_clip_length > end_distance: + warnings.add("Clipping {} amino acids off the end of peptide {} would clip the best peptide. Skipping.".format(left_clip_length, seq1)) + continue + if seq2 in best_peptide: + seq2_best_peptide = best_peptide[seq2] + first_position = seq2_seq.index(seq2_best_peptide) + if right_clip_length > first_position: + warnings.add("Clipping {} amino acids off the start of peptide {} would clip the best peptide. Skipping.".format(right_clip_length, seq2)) + continue trunc_seq1 = seq1_seq[(len(seq1_seq) - wingspan_length):(len(seq1_seq) - left_clip_length)] trunc_seq2 = seq2_seq[(0 + right_clip_length):wingspan_length] @@ -388,6 +411,8 @@ def execute(self): else: seq_ID = "{}|{}|{}|{}".format(seq1, left_clip_length, right_clip_length, seq2) epitopes[seq_ID] = trunc_seq1 + trunc_seq2 + for warning in list(warnings): + logging.info(warning) for seq_id in epitopes: sequence = epitopes[seq_id] @@ -409,16 +434,3 @@ def execute(self): writer.close() key_writer.close() - - def combine_problematic_peptides(self, seq_dict): - seq_tuples = [] - for (seq_id, data) in seq_dict.items(): - other_seq_ids = list(seq_dict.keys()) - other_seq_ids.remove(seq_id) - if data['problematic_start']: - for other_seq_id in other_seq_ids: - seq_tuples.append((other_seq_id, seq_id)) - if data['problematic_end']: - for other_seq_id in other_seq_ids: - seq_tuples.append((seq_id, other_seq_id)) - return list(set(seq_tuples)) diff --git a/pvactools/tools/pvacseq/generate_protein_fasta.py b/pvactools/tools/pvacseq/generate_protein_fasta.py index e5fed3618..35af041d6 100644 --- a/pvactools/tools/pvacseq/generate_protein_fasta.py +++ b/pvactools/tools/pvacseq/generate_protein_fasta.py @@ -6,6 +6,7 @@ import yaml import csv import re +import json from collections import OrderedDict from Bio import SeqIO from Bio.Seq import Seq @@ -125,14 +126,15 @@ def generate_fasta(flanking_sequence_length, downstream_sequence_length, temp_di def parse_input_tsv(input_tsv): if input_tsv is None: return (None, None) + indexes = [] with open(input_tsv, 'r') as fh: reader = csv.DictReader(fh, delimiter = "\t") - if 'Index' in reader.fieldnames: - indexes = parse_full_input_tsv(reader) - file_type = 'full' - else: + if 'Best Peptide' in reader.fieldnames: indexes = parse_aggregated_input_tsv(reader) file_type = 'aggregated' + else: + indexes = parse_full_input_tsv(reader) + file_type = 'full' return (indexes, file_type) def parse_full_input_tsv(reader): @@ -142,9 +144,9 @@ def parse_full_input_tsv(reader): return indexes def parse_aggregated_input_tsv(reader): - indexes = [] + indexes = {} for line in reader: - indexes.append(line) + indexes[line['Index']] = line return indexes def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_evaluation): @@ -155,7 +157,7 @@ def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_ with open(fasta_key_file_path, 'r') as fasta_key_file: keys = yaml.load(fasta_key_file, Loader=yaml.FullLoader) - (tsv_indexes, tsv_file_type) = parse_input_tsv(input_tsv) + (tsv_indexes, file_type) = parse_input_tsv(input_tsv) dataframe = OrderedDict() output_records = [] @@ -165,33 +167,28 @@ def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_ if mutant_only and record_id.startswith('WT.'): continue if tsv_indexes is not None: - if tsv_file_type == 'full': - sequence_type, index = record_id.split('.', 1) - if index not in tsv_indexes: - continue - else: - (rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2) - (peptide_type, count, gene, transcript) = rest_record_id.split(".", 3) - (parsed_aa_change, _, _, _) = index_to_aggregate_report_aa_change(aa_change, variant_type) - matches = [i for i in tsv_indexes if i['Best Transcript'] == transcript and i['AA Change'] == parsed_aa_change and i['Evaluation'] in aggregate_report_evaluation] - if len(matches) == 0: - continue - new_record = SeqRecord(record.seq, id=record_id, description=record_id) - output_records.append(new_record) + sequence_type, index = record_id.split('.', 1) + if file_type == 'full': + if index in tsv_indexes: + new_record = SeqRecord(record.seq, id=record_id, description=record_id) + output_records.append(new_record) + elif file_type == 'aggregated': + if index in tsv_indexes.keys() and tsv_indexes[index]['Evaluation'] in aggregate_report_evaluation: + if record_id.startswith('MT.'): + annotations = { 'Best Peptide': tsv_indexes[index]['Best Peptide'] } + new_record = SeqRecord(record.seq, id=record_id, description=json.dumps(annotations)) + else: + new_record = SeqRecord(record.seq, id=record_id, description=record_id) + output_records.append(new_record) + else: + new_record = SeqRecord(record.seq, id=record_id, description=record_id) + output_records.append(new_record) if tsv_indexes is not None: ordered_output_records = [] for tsv_index in tsv_indexes: - if tsv_file_type == 'full': - records = [r for r in output_records if r.id.split('.', 1)[1] == tsv_index] - ordered_output_records.extend(records) - else: - for output_record in output_records: - (rest_record_id, variant_type, aa_change) = output_record.id.rsplit(".", 2) - (peptide_type, count, gene, transcript) = rest_record_id.split(".", 3) - (parsed_aa_change, _, _, _) = index_to_aggregate_report_aa_change(aa_change, variant_type) - if tsv_index['Best Transcript'] == transcript and tsv_index['AA Change'] == parsed_aa_change: - ordered_output_records.append(output_record) + records = [r for r in output_records if r.id.split('.', 1)[1] == tsv_index] + ordered_output_records.extend(records) output_records = ordered_output_records SeqIO.write(output_records, output_file, "fasta") diff --git a/tests/test_data/pvacseq_generate_protein_fasta/input.aggregated.tsv b/tests/test_data/pvacseq_generate_protein_fasta/input.aggregated.tsv index 2b49da546..fa174c5bf 100644 --- a/tests/test_data/pvacseq_generate_protein_fasta/input.aggregated.tsv +++ b/tests/test_data/pvacseq_generate_protein_fasta/input.aggregated.tsv @@ -1,24 +1,24 @@ -ID A*02:01 B*35:01 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Evaluation -2-217498305-217498305-T-TGCTGCC 9 4 IGFBP2 L20LLP 1 PLLPLLPLL ENST00000233809.4 Not Supported HLA-A*02:01 7 None 12 129.750 221.227 1.400 2.5 NA NA NA NA 0.891 Pass Pending -22-38119219-38119220-GA-G 1 TRIOBP FS219 1 CPWSGTGQH ENST00000406386.3 Not Supported HLA-B*35:01 0 None 1 212.962 NA 1.500 NA NA NA NA NA 0.768 Pass Pending -22-50615580-50615581-C-T 6 3 PANX2 S147F 1 ALGWEFLAFT ENST00000395842.2 Not Supported HLA-A*02:01 9 None 7 168.895 227.61 1.693 1.707 NA NA NA NA 0.959 Anchor Pending -22-39994238-39994239-G-A 3 3 CACNA1I C107Y 1 YLSDRCKIL ENST00000402142.3 Not Supported HLA-A*02:01 1 None 6 134.880 2639.458 2.100 8.2 NA NA NA NA 0.043 Subclonal Pending -6-41754573-41754573-C-CCTT 4 PRICKLE4 -287-288L 1 TLSRTLLLAA ENST00000458694.1 Not Supported HLA-A*02:01 8 None 4 312.720 332.88 2.900 2.8 NA NA NA NA 0.158 Subclonal Pending -22-37771017-37771018-G-A 5 3 ELFN2 P186L 1 NLFNCECDL ENST00000402918.2 Not Supported HLA-A*02:01 2 None 7 387.055 27322.265 2.000 40.0 NA NA NA NA 0.135 Subclonal Pending -22-18644672-18644673-C-T 2 3 USP18 A124V 1 RPLELVYCL ENST00000215794.7 Not Supported HLA-B*35:01 6 None 5 471.710 399.275 1.100 0.9 NA NA NA NA 0.053 Subclonal Pending -22-46653595-46653596-G-A 3 4 PKDREJ T1875I 1 YSYGLLHIY ENST00000253255.5 Not Supported HLA-B*35:01 8 None 6 69.335 39.921 0.890 0.43 NA NA NA NA 0.233 Poor Pending -4-40434704-40434725-AGCGGCTGCGGCGGCTGCGGCC-A 2 1 RBM47 AAAAAAAA495-502A 1 SAAAAAAAV ENST00000381793.2 Not Supported HLA-B*35:01 9 None 2 508.630 427.517 2.600 2.5 NA NA NA NA 0.977 Poor Pending -22-22550509-22550510-T-G 2 IGLV6-57 S63A 1 GSAPTTVIY ENST00000390285.3 Not Supported HLA-B*35:01 3 None 2 567.080 901.875 3.400 4.1 NA NA NA NA 0.571 Poor Pending -22-50682229-50682230-T-C 2 3 TUBGCP6 H220R 1 SLFGALVRS ENST00000248846.5 Not Supported HLA-A*02:01 8 None 5 623.290 393.78 4.398 3.6 NA NA NA NA 0.686 Poor Pending -22-38027027-38027028-C-G 1 1 GGA1 P484A 1 LLHTVSPEPA ENST00000343632.4 Not Supported HLA-A*02:01 10 None 2 723.710 7116.565 3.000 18.0 NA NA NA NA 0.486 Poor Pending -22-19175521-19175522-G-T 2 CLTCL1 H1469N 1 SVNEALNNL ENST00000263200.10 Not Supported HLA-A*02:01 8 None 2 1010.045 793.83 4.898 4.9 NA NA NA NA 0.100 Poor Pending -22-41920894-41920895-G-C 1 ACO2 E510Q 1 NPQTDYLTG ENST00000216254.4 Not Supported HLA-B*35:01 3 None 1 3121.315 3595.485 6.602 6.5 NA NA NA NA 0.250 Poor Pending -22-41895790-41895791-C-A 1 ACO2 A33E 1 EMSHFEPNEY ENST00000216254.4 Not Supported HLA-B*35:01 1 None 1 3903.700 4456.03 2.600 3.7 NA NA NA NA 0.044 Poor Pending -22-26936775-26936776-G-T 1 TPST2 P274H 1 DLIGKHGGV ENST00000338754.4 Not Supported HLA-A*02:01 6 None 1 4145.685 3402.989 13.000 7.7 NA NA NA NA 0.179 Poor Pending -22-20709231-20709232-G-C FAM230A E322Q 0 IANQDAAQG ENST00000434783.3 Not Supported HLA-B*35:01 4 None 0 5738.350 5541.375 10.500 7.674 NA NA NA NA 0.500 Poor Pending -22-50869713-50869714-C-A PPP6R2 S414Y 0 YESRVEPPH ENST00000395741.3 Not Supported HLA-B*35:01 1 None 0 6159.300 18030.513 14.000 33.0 NA NA NA NA 0.043 Poor Pending -22-22550449-22550450-C-G IGLV6-57 R43G 0 ISCTGSSGSI ENST00000390285.3 Not Supported HLA-A*02:01 5 None 0 9672.631 20663.37 30.000 34.0 NA NA NA NA 1.000 Poor Pending -22-18020271-18020272-G-A CECR2 R535H 0 SGGSHVWTH ENST00000262608.8 Not Supported HLA-B*35:01 9 None 0 10761.970 30462.795 24.000 42.0 NA NA NA NA 0.071 Poor Pending -22-29886116-29886117-C-A NEFH P830T 0 SPVKEEEKT ENST00000310624.6 Not Supported HLA-B*35:01 9 None 0 12628.540 17588.29 26.000 32.0 NA NA NA NA 0.038 Poor Pending -22-37966274-37966275-C-G LGALS2 E132Q 0 FNMSSFKLKQ ENST00000215886.4 Not Supported HLA-A*02:01 10 None 0 16963.785 16973.74 33.000 39.0 NA NA NA NA 0.496 Poor Pending -22-50555769-50555770-G-A MOV10L1 A482T 0 SAKTTVVVTT ENST00000262794.5 Not Supported HLA-A*02:01 10 None 0 27206.914 17665.8 40.000 26.0 NA NA NA NA 0.042 Poor Pending +ID Index E*01:01 G*01:09 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Evaluation +22-41920894-41920895-G-C 19.ACO2.ENST00000216254.4.missense.510E/Q 2 1 ACO2 E510Q 1 KFNPQTDYL ENST00000216254.4 Not Supported HLA-G*01:09 5 None 3 1262.760 1318.61 0.500 0.6 NA NA NA NA 0.250 Poor Pending +22-22550509-22550510-T-G 10.IGLV6-57.ENST00000390285.3.missense.63S/A 2 1 IGLV6-57 S63A 1 QRPGSAPTT ENST00000390285.3 Not Supported HLA-E*01:01 6 None 3 1362.110 1517.76 0.300 0.3 NA NA NA NA 0.571 Poor Pending +22-46653595-46653596-G-A 20.PKDREJ.ENST00000253255.5.missense.1875T/I 1 5 PKDREJ T1875I 1 LYYSYGLLHI ENST00000253255.5 Not Supported HLA-G*01:09 10 None 6 1469.280 2365.16 0.300 0.4 NA NA NA NA 0.233 Poor Pending +22-38027027-38027028-C-G 15.GGA1.ENST00000343632.4.missense.484P/A 2 GGA1 P484A 1 ARPPQQPVP ENST00000343632.4 Not Supported HLA-E*01:01 1 None 2 1654.990 4242.33 0.400 4.8 NA NA NA NA 0.486 Poor Pending +22-39994238-39994239-G-A 17.CACNA1I.ENST00000402142.3.missense.107C/Y 2 1 CACNA1I C107Y 1 YQPCDDMDY ENST00000402142.3 Not Supported HLA-E*01:01 9 None 3 1864.160 1804.62 0.600 0.6 NA NA NA NA 0.043 Poor Pending +6-41754573-41754573-C-CCTT 3.PRICKLE4.ENST00000458694.1.inframe_ins.287-288-/L 1 1 PRICKLE4 -287-288L 1 ATLSRTLLL ENST00000458694.1 Not Supported HLA-E*01:01 9 None 1 2122.610 3272.12 0.120 5.1 NA NA NA NA 0.158 Poor Pending +22-18020271-18020272-G-A 5.CECR2.ENST00000262608.8.missense.535R/H 1 CECR2 R535H 1 WTHSRDPEG ENST00000262608.8 Not Supported HLA-E*01:01 3 None 1 2523.800 1765.99 1.400 0.5 NA NA NA NA 0.071 Poor Pending +2-217498305-217498305-T-TGCTGCC 1.IGFBP2.ENST00000233809.4.inframe_ins.20L/LLP 2 1 IGFBP2 L20LLP 1 LLPLLPLLL ENST00000233809.4 Not Supported HLA-E*01:01 6-7 None 2 2551.250 3099.81 0.170 0.33 NA NA NA NA 0.891 Poor Pending +22-18644672-18644673-C-T 6.USP18.ENST00000215794.7.missense.124A/V 1 USP18 A124V 1 LVYCLQKCN ENST00000215794.7 Not Supported HLA-G*01:09 2 None 1 3099.810 6399.79 3.301 12.0 NA NA NA NA 0.053 Poor Pending +22-50615580-50615581-C-T 22.PANX2.ENST00000395842.2.missense.147S/F 1 1 PANX2 S147F 1 FLAFTRLTS ENST00000395842.2 Not Supported HLA-E*01:01 4 None 2 3343.700 3202.08 2.699 2.4 NA NA NA NA 0.959 Poor Pending +22-37771017-37771018-G-A 13.ELFN2.ENST00000402918.2.missense.186P/L 1 ELFN2 P186L 1 MVCELAGNL ENST00000402918.2 Not Supported HLA-G*01:09 9 None 1 3454.020 10191.11 4.102 21.0 NA NA NA NA 0.135 Poor Pending +22-37966274-37966275-C-G 14.LGALS2.ENST00000215886.4.missense.132E/Q 1 LGALS2 E132Q 1 NMSSFKLKQ ENST00000215886.4 Not Supported HLA-E*01:01 9 None 1 3890.570 3848.7 4.000 3.9 NA NA NA NA 0.496 Poor Pending +22-41895790-41895791-C-A 18.ACO2.ENST00000216254.4.missense.33A/E 2 ACO2 A33E 1 EMSHFEPNE ENST00000216254.4 Not Supported HLA-E*01:01 1 None 2 3932.890 2443.19 4.199 1.3 NA NA NA NA 0.044 Poor Pending +22-50682229-50682230-T-C 23.TUBGCP6.ENST00000248846.5.missense.220H/R 1 1 TUBGCP6 H220R 1 RSRTYDMDV ENST00000248846.5 Not Supported HLA-G*01:09 1 None 2 4576.120 7609.38 6.301 15.0 NA NA NA NA 0.686 Poor Pending +22-19175521-19175522-G-T 7.CLTCL1.ENST00000263200.10.missense.1469H/N 1 CLTCL1 H1469N 1 SVNEALNNL ENST00000263200.10 Not Supported HLA-G*01:09 8 None 1 4989.870 7775.84 7.301 15.0 NA NA NA NA 0.100 Poor Pending +22-50869713-50869714-C-A 24.PPP6R2.ENST00000395741.3.missense.414S/Y PPP6R2 S414Y 0 GYESRVEPP ENST00000395741.3 Not Supported HLA-G*01:09 2 None 0 5620.540 22209.93 8.703 44.0 NA NA NA NA 0.043 Poor Pending +22-20709231-20709232-G-C 8.FAM230A.ENST00000434783.3.missense.322E/Q FAM230A E322Q 0 ANQDAAQGI ENST00000434783.3 Not Supported HLA-G*01:09 3 None 0 5681.680 9654.43 8.797 20.0 NA NA NA NA 0.500 Poor Pending +22-26936775-26936776-G-T 11.TPST2.ENST00000338754.4.missense.274P/H TPST2 P274H 0 KHGGVSLSK ENST00000338754.4 Not Supported HLA-E*01:01 2 None 0 6539.790 20149.14 13.000 53.0 NA NA NA NA 0.179 Poor Pending +22-29886116-29886117-C-A 12.NEFH.ENST00000310624.6.missense.830P/T NEFH P830T 0 KTQEVKVKE ENST00000310624.6 Not Supported HLA-G*01:09 2 None 0 7366.350 25842.4 14.000 50.0 NA NA NA NA 0.038 Poor Pending +22-38119219-38119220-GA-G 16.TRIOBP.ENST00000406386.3.FS.219GA/G TRIOBP FS219 0 GEKAGCPWS ENST00000406386.3 Not Supported HLA-E*01:01 0-10 None 0 8119.760 NA 18.000 NA NA NA NA NA 0.768 Poor Pending +22-22550449-22550450-C-G 9.IGLV6-57.ENST00000390285.3.missense.43R/G IGLV6-57 R43G 0 KTVTISCTG ENST00000390285.3 Not Supported HLA-G*01:09 9 None 0 9447.760 15208.39 19.000 31.0 NA NA NA NA 1.000 Poor Pending +22-50555769-50555770-G-A 21.MOV10L1.ENST00000262794.5.missense.482A/T MOV10L1 A482T 0 KTTVVVTTQ ENST00000262794.5 Not Supported HLA-G*01:09 8 None 0 10874.650 16763.86 23.000 34.0 NA NA NA NA 0.042 Poor Pending +4-40434704-40434725-AGCGGCTGCGGCGGCTGCGGCC-A 2.RBM47.ENST00000381793.2.inframe_del.495-502AAAAAAAA/A RBM47 AAAAAAAA495-502A 0 SAAAAAAAV ENST00000381793.2 Not Supported HLA-E*01:01 8-9 None 0 21040.320 21040.32 21.000 30.0 NA NA NA NA 0.977 Poor Pending diff --git a/tests/test_data/pvacseq_generate_protein_fasta/output_with_aggregated_tsv.fasta b/tests/test_data/pvacseq_generate_protein_fasta/output_with_aggregated_tsv.fasta index 1f7141f2f..341c55f40 100644 --- a/tests/test_data/pvacseq_generate_protein_fasta/output_with_aggregated_tsv.fasta +++ b/tests/test_data/pvacseq_generate_protein_fasta/output_with_aggregated_tsv.fasta @@ -1,92 +1,92 @@ ->WT.1.IGFBP2.ENST00000233809.4.inframe_ins.20L/LLP -LPLPPPPLLPLLLLLLGASGGGG ->MT.1.IGFBP2.ENST00000233809.4.inframe_ins.20L/LLP -LPLPPPPLLPLLPLLLLLGASGGGG ->WT.16.TRIOBP.ENST00000406386.3.FS.219GA/G -EDTGGGGRSAGQHWARLRGE ->MT.16.TRIOBP.ENST00000406386.3.FS.219GA/G -EDTGGGGRSAGSTGQGSGEKAGCPWSGTGQH ->WT.22.PANX2.ENST00000395842.2.missense.147S/F -VPALGWEFLASTRLTSELNFL ->MT.22.PANX2.ENST00000395842.2.missense.147S/F -VPALGWEFLAFTRLTSELNFL +>WT.19.ACO2.ENST00000216254.4.missense.510E/Q +AIAGTLKFNPETDYLTGTDGK +>MT.19.ACO2.ENST00000216254.4.missense.510E/Q {"Best Peptide": "KFNPQTDYL"} +AIAGTLKFNPQTDYLTGTDGK +>WT.10.IGLV6-57.ENST00000390285.3.missense.63S/A +VQWYQQRPGSSPTTVIYEDNQ +>MT.10.IGLV6-57.ENST00000390285.3.missense.63S/A {"Best Peptide": "QRPGSAPTT"} +VQWYQQRPGSAPTTVIYEDNQ +>WT.20.PKDREJ.ENST00000253255.5.missense.1875T/I +WLYYSYGLLHTYGSGGYALYF +>MT.20.PKDREJ.ENST00000253255.5.missense.1875T/I {"Best Peptide": "LYYSYGLLHI"} +WLYYSYGLLHIYGSGGYALYF +>WT.15.GGA1.ENST00000343632.4.missense.484P/A +SLLHTVSPEPPRPPQQPVPTE +>MT.15.GGA1.ENST00000343632.4.missense.484P/A {"Best Peptide": "ARPPQQPVP"} +SLLHTVSPEPARPPQQPVPTE >WT.17.CACNA1I.ENST00000402142.3.missense.107C/Y GMYQPCDDMDCLSDRCKILQV ->MT.17.CACNA1I.ENST00000402142.3.missense.107C/Y +>MT.17.CACNA1I.ENST00000402142.3.missense.107C/Y {"Best Peptide": "YQPCDDMDY"} GMYQPCDDMDYLSDRCKILQV >WT.3.PRICKLE4.ENST00000458694.1.inframe_ins.287-288-/L VNSATLSRTLLAAAGGSSLQT ->MT.3.PRICKLE4.ENST00000458694.1.inframe_ins.287-288-/L +>MT.3.PRICKLE4.ENST00000458694.1.inframe_ins.287-288-/L {"Best Peptide": "ATLSRTLLL"} VNSATLSRTLLLAAAGGSSLQT ->WT.13.ELFN2.ENST00000402918.2.missense.186P/L -SLMVCELAGNPFNCECDLFGF ->MT.13.ELFN2.ENST00000402918.2.missense.186P/L -SLMVCELAGNLFNCECDLFGF +>WT.5.CECR2.ENST00000262608.8.missense.535R/H +GRSGGSHVWTRSRDPEGSSRK +>MT.5.CECR2.ENST00000262608.8.missense.535R/H {"Best Peptide": "WTHSRDPEG"} +GRSGGSHVWTHSRDPEGSSRK +>WT.1.IGFBP2.ENST00000233809.4.inframe_ins.20L/LLP +LPLPPPPLLPLLLLLLGASGGGG +>MT.1.IGFBP2.ENST00000233809.4.inframe_ins.20L/LLP {"Best Peptide": "LLPLLPLLL"} +LPLPPPPLLPLLPLLLLLGASGGGG >WT.6.USP18.ENST00000215794.7.missense.124A/V RQKAVRPLELAYCLQKCNVPL ->MT.6.USP18.ENST00000215794.7.missense.124A/V +>MT.6.USP18.ENST00000215794.7.missense.124A/V {"Best Peptide": "LVYCLQKCN"} RQKAVRPLELVYCLQKCNVPL ->WT.20.PKDREJ.ENST00000253255.5.missense.1875T/I -WLYYSYGLLHTYGSGGYALYF ->MT.20.PKDREJ.ENST00000253255.5.missense.1875T/I -WLYYSYGLLHIYGSGGYALYF ->WT.2.RBM47.ENST00000381793.2.inframe_del.495-502AAAAAAAA/A -DPASAAAAAAAAAAAAAAVIPTVSTPPP ->MT.2.RBM47.ENST00000381793.2.inframe_del.495-502AAAAAAAA/A -DPASAAAAAAAVIPTVSTPPP ->WT.10.IGLV6-57.ENST00000390285.3.missense.63S/A -VQWYQQRPGSSPTTVIYEDNQ ->MT.10.IGLV6-57.ENST00000390285.3.missense.63S/A -VQWYQQRPGSAPTTVIYEDNQ +>WT.22.PANX2.ENST00000395842.2.missense.147S/F +VPALGWEFLASTRLTSELNFL +>MT.22.PANX2.ENST00000395842.2.missense.147S/F {"Best Peptide": "FLAFTRLTS"} +VPALGWEFLAFTRLTSELNFL +>WT.13.ELFN2.ENST00000402918.2.missense.186P/L +SLMVCELAGNPFNCECDLFGF +>MT.13.ELFN2.ENST00000402918.2.missense.186P/L {"Best Peptide": "MVCELAGNL"} +SLMVCELAGNLFNCECDLFGF +>WT.14.LGALS2.ENST00000215886.4.missense.132E/Q +SHLSYLSVRGGFNMSSFKLKE +>MT.14.LGALS2.ENST00000215886.4.missense.132E/Q {"Best Peptide": "NMSSFKLKQ"} +SHLSYLSVRGGFNMSSFKLKQ +>WT.18.ACO2.ENST00000216254.4.missense.33A/E +ASVLCQRAKVAMSHFEPNEYI +>MT.18.ACO2.ENST00000216254.4.missense.33A/E {"Best Peptide": "EMSHFEPNE"} +ASVLCQRAKVEMSHFEPNEYI >WT.23.TUBGCP6.ENST00000248846.5.missense.220H/R TRVSLFGALVHSRTYDMDVRL ->MT.23.TUBGCP6.ENST00000248846.5.missense.220H/R +>MT.23.TUBGCP6.ENST00000248846.5.missense.220H/R {"Best Peptide": "RSRTYDMDV"} TRVSLFGALVRSRTYDMDVRL ->WT.15.GGA1.ENST00000343632.4.missense.484P/A -SLLHTVSPEPPRPPQQPVPTE ->MT.15.GGA1.ENST00000343632.4.missense.484P/A -SLLHTVSPEPARPPQQPVPTE >WT.7.CLTCL1.ENST00000263200.10.missense.1469H/N NNKSVNEALNHLLTEEEDYQG ->MT.7.CLTCL1.ENST00000263200.10.missense.1469H/N +>MT.7.CLTCL1.ENST00000263200.10.missense.1469H/N {"Best Peptide": "SVNEALNNL"} NNKSVNEALNNLLTEEEDYQG ->WT.19.ACO2.ENST00000216254.4.missense.510E/Q -AIAGTLKFNPETDYLTGTDGK ->MT.19.ACO2.ENST00000216254.4.missense.510E/Q -AIAGTLKFNPQTDYLTGTDGK ->WT.18.ACO2.ENST00000216254.4.missense.33A/E -ASVLCQRAKVAMSHFEPNEYI ->MT.18.ACO2.ENST00000216254.4.missense.33A/E -ASVLCQRAKVEMSHFEPNEYI ->WT.11.TPST2.ENST00000338754.4.missense.274P/H -VLHHEDLIGKPGGVSLSKIER ->MT.11.TPST2.ENST00000338754.4.missense.274P/H -VLHHEDLIGKHGGVSLSKIER ->WT.8.FAM230A.ENST00000434783.3.missense.322E/Q -KEDAVQGIANEDAAQGIAKED ->MT.8.FAM230A.ENST00000434783.3.missense.322E/Q -KEDAVQGIANQDAAQGIAKED >WT.24.PPP6R2.ENST00000395741.3.missense.414S/Y AREERTEASGSESRVEPPHEN ->MT.24.PPP6R2.ENST00000395741.3.missense.414S/Y +>MT.24.PPP6R2.ENST00000395741.3.missense.414S/Y {"Best Peptide": "GYESRVEPP"} AREERTEASGYESRVEPPHEN ->WT.9.IGLV6-57.ENST00000390285.3.missense.43R/G -PGKTVTISCTRSSGSIASNYV ->MT.9.IGLV6-57.ENST00000390285.3.missense.43R/G -PGKTVTISCTGSSGSIASNYV ->WT.5.CECR2.ENST00000262608.8.missense.535R/H -GRSGGSHVWTRSRDPEGSSRK ->MT.5.CECR2.ENST00000262608.8.missense.535R/H -GRSGGSHVWTHSRDPEGSSRK +>WT.8.FAM230A.ENST00000434783.3.missense.322E/Q +KEDAVQGIANEDAAQGIAKED +>MT.8.FAM230A.ENST00000434783.3.missense.322E/Q {"Best Peptide": "ANQDAAQGI"} +KEDAVQGIANQDAAQGIAKED +>WT.11.TPST2.ENST00000338754.4.missense.274P/H +VLHHEDLIGKPGGVSLSKIER +>MT.11.TPST2.ENST00000338754.4.missense.274P/H {"Best Peptide": "KHGGVSLSK"} +VLHHEDLIGKHGGVSLSKIER >WT.12.NEFH.ENST00000310624.6.missense.830P/T VKSPVKEEEKPQEVKVKEPPK ->MT.12.NEFH.ENST00000310624.6.missense.830P/T +>MT.12.NEFH.ENST00000310624.6.missense.830P/T {"Best Peptide": "KTQEVKVKE"} VKSPVKEEEKTQEVKVKEPPK ->WT.14.LGALS2.ENST00000215886.4.missense.132E/Q -SHLSYLSVRGGFNMSSFKLKE ->MT.14.LGALS2.ENST00000215886.4.missense.132E/Q -SHLSYLSVRGGFNMSSFKLKQ +>WT.16.TRIOBP.ENST00000406386.3.FS.219GA/G +EDTGGGGRSAGQHWARLRGE +>MT.16.TRIOBP.ENST00000406386.3.FS.219GA/G {"Best Peptide": "GEKAGCPWS"} +EDTGGGGRSAGSTGQGSGEKAGCPWSGTGQH +>WT.9.IGLV6-57.ENST00000390285.3.missense.43R/G +PGKTVTISCTRSSGSIASNYV +>MT.9.IGLV6-57.ENST00000390285.3.missense.43R/G {"Best Peptide": "KTVTISCTG"} +PGKTVTISCTGSSGSIASNYV >WT.21.MOV10L1.ENST00000262794.5.missense.482A/T TSAKTTVVVTAQKRNSRRQLP ->MT.21.MOV10L1.ENST00000262794.5.missense.482A/T +>MT.21.MOV10L1.ENST00000262794.5.missense.482A/T {"Best Peptide": "KTTVVVTTQ"} TSAKTTVVVTTQKRNSRRQLP +>WT.2.RBM47.ENST00000381793.2.inframe_del.495-502AAAAAAAA/A +DPASAAAAAAAAAAAAAAVIPTVSTPPP +>MT.2.RBM47.ENST00000381793.2.inframe_del.495-502AAAAAAAA/A {"Best Peptide": "SAAAAAAAV"} +DPASAAAAAAAVIPTVSTPPP diff --git a/tests/test_data/pvacvector/Test.vector.prevent_clipping_best_peptide.input.fa b/tests/test_data/pvacvector/Test.vector.prevent_clipping_best_peptide.input.fa new file mode 100644 index 000000000..04b39b186 --- /dev/null +++ b/tests/test_data/pvacvector/Test.vector.prevent_clipping_best_peptide.input.fa @@ -0,0 +1,16 @@ +>MT.20.PKDREJ.ENST00000253255.5.missense.1875T/I {"Best Peptide": "LYYSYGLLHI"} +WLYYSYGLLHIYGSGGYALYF +>MT.15.GGA1.ENST00000343632.4.missense.484P/A {"Best Peptide": "ARPPQQPVP"} +SLLHTVSPEPARPPQQPVPTE +>MT.17.CACNA1I.ENST00000402142.3.missense.107C/Y {"Best Peptide": "YQPCDDMDY"} +GMYQPCDDMDYLSDRCKILQV +>MT.13.ELFN2.ENST00000402918.2.missense.186P/L {"Best Peptide": "MVCELAGNL"} +SLMVCELAGNLFNCECDLFGF +>MT.14.LGALS2.ENST00000215886.4.missense.132E/Q {"Best Peptide": "NMSSFKLKQ"} +SHLSYLSVRGGFNMSSFKLKQ +>MT.18.ACO2.ENST00000216254.4.missense.33A/E {"Best Peptide": "EMSHFEPNE"} +ASVLCQRAKVEMSHFEPNEYI +>MT.23.TUBGCP6.ENST00000248846.5.missense.220H/R {"Best Peptide": "RSRTYDMDV"} +TRVSLFGALVRSRTYDMDVRL +>MT.9.IGLV6-57.ENST00000390285.3.missense.43R/G {"Best Peptide": "KTVTISCTG"} +PGKTVTISCTGSSGSIASNYV diff --git a/tests/test_output_parser.py b/tests/test_output_parser.py index df3bc4843..be3b42970 100644 --- a/tests/test_output_parser.py +++ b/tests/test_output_parser.py @@ -437,7 +437,4 @@ def test_parse_output_runs_and_produces_expeceted_output_for_complex_inframe_ins self.assertFalse(parser.execute()) expected_output_file = os.path.join(self.test_data_dir, "complex_inframe_insertion", "output.iedb.parsed.tsv") - import shutil - shutil.copy(parse_output_output_file.name, expected_output_file) - self.assertTrue(compare(parse_output_output_file.name, expected_output_file)) diff --git a/tests/test_pvacvector.py b/tests/test_pvacvector.py index 1a55d1e81..ae9b40d74 100644 --- a/tests/test_pvacvector.py +++ b/tests/test_pvacvector.py @@ -424,3 +424,41 @@ def test_pvacvector_remove_peptides(self): )) output_dir.cleanup() + + def test_prevent_clipping_best_peptide(self): + output_dir = tempfile.TemporaryDirectory() + input_file = os.path.join(self.test_data_dir, 'Test.vector.prevent_clipping_best_peptide.input.fa') + + with self.assertLogs(level='INFO') as log: + run.main([ + input_file, + self.test_run_name, + self.allele, + self.method, + output_dir.name, + '-e1', self.epitope_length, + '-n', self.input_n_mer, + '-k', + '-b', '22000', + '--spacers', 'None', + ]) + self.assertIn("INFO:root:Clipping 1 amino acids off the end of peptide MT.14.LGALS2.ENST00000215886.4.missense.132E/Q would clip the best peptide. Skipping.", log.output) + self.assertIn("INFO:root:Clipping 2 amino acids off the start of peptide MT.20.PKDREJ.ENST00000253255.5.missense.1875T/I would clip the best peptide. Skipping.", log.output) + self.assertIn("INFO:root:Clipping 2 amino acids off the end of peptide MT.14.LGALS2.ENST00000215886.4.missense.132E/Q would clip the best peptide. Skipping.", log.output) + + best_peptides = [ + "LYYSYGLLHI", + "ARPPQQPVP", + "YQPCDDMDY", + "MVCELAGNL", + "NMSSFKLKQ", + "EMSHFEPNE", + "RSRTYDMDV", + "KTVTISCTG" + ] + with open(os.path.join(output_dir.name, "test_pvacvector_produces_expected_output_results.fa"), "r") as file: + file_content = file.read() + for best_peptide in best_peptides: + self.assertIn(best_peptide, file_content) + + output_dir.cleanup()