Prevent pVACvector from clipping Best Peptide

griffithlab · susannasiebert · Jan 15, 2025 · Jan 7, 2025 · Jan 15, 2025 · Jan 7, 2025
commit 9d316637325faa8bab8219410190b5e7caab6c62
diff --git a/pvactools/lib/fasta_generator.py b/pvactools/lib/fasta_generator.py
@@ -7,6 +7,8 @@
 from abc import ABCMeta
 from Bio import SeqIO
 import itertools
+import logging
+
 from pvactools.lib.proximal_variant import ProximalVariant
 
 csv.field_size_limit(sys.maxsize)
@@ -364,13 +366,21 @@ def __init__(self, **kwargs):
 
     def execute(self):
         seq_dict = dict()
+        best_peptide = dict()
         for record in SeqIO.parse(self.input_file, "fasta"):
             seq_dict[record.id] = str(record.seq)
+            description = record.description.replace("{} ".format(record.id), "")
+            if description != "":
+               try:
+                   best_peptide[record.id] = json.loads(description)['Best Peptide']
+               except:
+                   pass
 
         for length in self.epitope_lengths:
             epitopes = dict()
             fasta_sequences = OrderedDict()
             wingspan_length = length - 1
+            warnings = set()
             for (seq1, seq2) in self.junctions_to_test:
                 seq1_seq = seq_dict[seq1]
                 seq2_seq = seq_dict[seq2]
@@ -379,6 +389,19 @@ def execute(self):
                         #These combinations would've already been tested in previous attempts with lower clip lengths and can be skipped
                         if left_clip_length < self.clip_length and right_clip_length < self.clip_length:
                             continue
+                        if seq1 in best_peptide:
+                            seq1_best_peptide = best_peptide[seq1]
+                            last_position = seq1_seq.rindex(seq1_best_peptide) + len(seq1_best_peptide)
+                            end_distance = len(seq1_seq) - last_position
+                            if left_clip_length > end_distance:
+                                warnings.add("Clipping {} amino acids off the end of peptide {} would clip the best peptide. Skipping.".format(left_clip_length, seq1))
+                                continue
+                        if seq2 in best_peptide:
+                            seq2_best_peptide = best_peptide[seq2]
+                            first_position = seq2_seq.index(seq2_best_peptide)
+                            if right_clip_length > first_position:
+                                warnings.add("Clipping {} amino acids off the start of peptide {} would clip the best peptide. Skipping.".format(right_clip_length, seq2))
+                                continue
                         trunc_seq1 = seq1_seq[(len(seq1_seq) - wingspan_length):(len(seq1_seq) - left_clip_length)]
                         trunc_seq2 = seq2_seq[(0 + right_clip_length):wingspan_length]
 
@@ -388,6 +411,8 @@ def execute(self):
                         else:
                             seq_ID = "{}|{}|{}|{}".format(seq1, left_clip_length, right_clip_length, seq2)
                             epitopes[seq_ID] = trunc_seq1 + trunc_seq2
+            for warning in list(warnings):
+                logging.info(warning)
 
             for seq_id in epitopes:
                 sequence = epitopes[seq_id]
@@ -409,16 +434,3 @@ def execute(self):
 
             writer.close()
             key_writer.close()
-
-    def combine_problematic_peptides(self, seq_dict):
-        seq_tuples = []
-        for (seq_id, data) in seq_dict.items():
-            other_seq_ids = list(seq_dict.keys())
-            other_seq_ids.remove(seq_id)
-            if data['problematic_start']:
-                for other_seq_id in other_seq_ids:
-                    seq_tuples.append((other_seq_id, seq_id))
-            if data['problematic_end']:
-                for other_seq_id in other_seq_ids:
-                    seq_tuples.append((seq_id, other_seq_id))
-        return list(set(seq_tuples))
diff --git a/pvactools/tools/pvacseq/generate_protein_fasta.py b/pvactools/tools/pvacseq/generate_protein_fasta.py
@@ -6,6 +6,7 @@
 import yaml
 import csv
 import re
+import json
 from collections import OrderedDict
 from Bio import SeqIO
 from Bio.Seq import Seq
@@ -125,14 +126,15 @@ def generate_fasta(flanking_sequence_length, downstream_sequence_length, temp_di
 def parse_input_tsv(input_tsv):
     if input_tsv is None:
         return (None, None)
+    indexes = []
     with open(input_tsv, 'r') as fh:
         reader = csv.DictReader(fh, delimiter = "\t")
-        if 'Index' in reader.fieldnames:
-            indexes = parse_full_input_tsv(reader)
-            file_type = 'full'
-        else:
+        if 'Best Peptide' in reader.fieldnames:
             indexes = parse_aggregated_input_tsv(reader)
             file_type = 'aggregated'
+        else:
+            indexes = parse_full_input_tsv(reader)
+            file_type = 'full'
     return (indexes, file_type)
 
 def parse_full_input_tsv(reader):
@@ -142,9 +144,9 @@ def parse_full_input_tsv(reader):
     return indexes
 
 def parse_aggregated_input_tsv(reader):
-    indexes = []
+    indexes = {}
     for line in reader:
-        indexes.append(line)
+        indexes[line['Index']] = line
     return indexes
 
 def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_evaluation):
@@ -155,7 +157,7 @@ def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_
     with open(fasta_key_file_path, 'r') as fasta_key_file:
         keys = yaml.load(fasta_key_file, Loader=yaml.FullLoader)
 
-    (tsv_indexes, tsv_file_type) = parse_input_tsv(input_tsv)
+    (tsv_indexes, file_type) = parse_input_tsv(input_tsv)
 
     dataframe = OrderedDict()
     output_records = []
@@ -165,33 +167,28 @@ def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_
             if mutant_only and record_id.startswith('WT.'):
                 continue
             if tsv_indexes is not None:
-                if tsv_file_type == 'full':
-                    sequence_type, index = record_id.split('.', 1)
-                    if index not in tsv_indexes:
-                        continue
-                else:
-                    (rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2)
-                    (peptide_type, count, gene, transcript) = rest_record_id.split(".", 3)
-                    (parsed_aa_change, _, _, _) = index_to_aggregate_report_aa_change(aa_change, variant_type)
-                    matches = [i for i in tsv_indexes if i['Best Transcript'] == transcript and i['AA Change'] == parsed_aa_change and i['Evaluation'] in aggregate_report_evaluation]
-                    if len(matches) == 0:
-                        continue
-            new_record = SeqRecord(record.seq, id=record_id, description=record_id)
-            output_records.append(new_record)
+                sequence_type, index = record_id.split('.', 1)
+                if file_type == 'full':
+                    if index in tsv_indexes:
+                        new_record = SeqRecord(record.seq, id=record_id, description=record_id)
+                        output_records.append(new_record)
+                elif file_type == 'aggregated':
+                    if index in tsv_indexes.keys() and tsv_indexes[index]['Evaluation'] in aggregate_report_evaluation:
+                        if record_id.startswith('MT.'):
+                            annotations = { 'Best Peptide': tsv_indexes[index]['Best Peptide'] }
+                            new_record = SeqRecord(record.seq, id=record_id, description=json.dumps(annotations))
+                        else:
+                            new_record = SeqRecord(record.seq, id=record_id, description=record_id)
+                        output_records.append(new_record)
+            else:
+                new_record = SeqRecord(record.seq, id=record_id, description=record_id)
+                output_records.append(new_record)
 
     if tsv_indexes is not None:
         ordered_output_records = []
         for tsv_index in tsv_indexes:
-            if tsv_file_type == 'full':
-                records = [r for r in output_records if r.id.split('.', 1)[1] == tsv_index]
-                ordered_output_records.extend(records)
-            else:
-                for output_record in output_records:
-                    (rest_record_id, variant_type, aa_change) = output_record.id.rsplit(".", 2)
-                    (peptide_type, count, gene, transcript) = rest_record_id.split(".", 3)
-                    (parsed_aa_change, _, _, _) = index_to_aggregate_report_aa_change(aa_change, variant_type)
-                    if tsv_index['Best Transcript'] == transcript and tsv_index['AA Change'] == parsed_aa_change:
-                        ordered_output_records.append(output_record)
+            records = [r for r in output_records if r.id.split('.', 1)[1] == tsv_index]
+            ordered_output_records.extend(records)
         output_records = ordered_output_records
 
     SeqIO.write(output_records, output_file, "fasta")

diff --git a/tests/test_data/pvacseq_generate_protein_fasta/input.aggregated.tsv b/tests/test_data/pvacseq_generate_protein_fasta/input.aggregated.tsv
@@ -1,24 +1,24 @@
-ID	A*02:01	B*35:01	Gene	AA Change	Num Passing Transcripts	Best Peptide	Best Transcript	TSL	Allele	Pos	Prob Pos	Num Passing Peptides	IC50 MT	IC50 WT	%ile MT	%ile WT	RNA Expr	RNA VAF	Allele Expr	RNA Depth	DNA VAF	Tier	Evaluation
-2-217498305-217498305-T-TGCTGCC	9	4	IGFBP2	L20LLP	1	PLLPLLPLL	ENST00000233809.4	Not Supported	HLA-A*02:01	7	None	12	129.750	221.227	1.400	2.5	NA	NA	NA	NA	0.891	Pass	Pending
-22-38119219-38119220-GA-G		1	TRIOBP	FS219	1	CPWSGTGQH	ENST00000406386.3	Not Supported	HLA-B*35:01	0	None	1	212.962	NA	1.500	NA	NA	NA	NA	NA	0.768	Pass	Pending
-22-50615580-50615581-C-T	6	3	PANX2	S147F	1	ALGWEFLAFT	ENST00000395842.2	Not Supported	HLA-A*02:01	9	None	7	168.895	227.61	1.693	1.707	NA	NA	NA	NA	0.959	Anchor	Pending
-22-39994238-39994239-G-A	3	3	CACNA1I	C107Y	1	YLSDRCKIL	ENST00000402142.3	Not Supported	HLA-A*02:01	1	None	6	134.880	2639.458	2.100	8.2	NA	NA	NA	NA	0.043	Subclonal	Pending
-6-41754573-41754573-C-CCTT	4		PRICKLE4	-287-288L	1	TLSRTLLLAA	ENST00000458694.1	Not Supported	HLA-A*02:01	8	None	4	312.720	332.88	2.900	2.8	NA	NA	NA	NA	0.158	Subclonal	Pending
-22-37771017-37771018-G-A	5	3	ELFN2	P186L	1	NLFNCECDL	ENST00000402918.2	Not Supported	HLA-A*02:01	2	None	7	387.055	27322.265	2.000	40.0	NA	NA	NA	NA	0.135	Subclonal	Pending
-22-18644672-18644673-C-T	2	3	USP18	A124V	1	RPLELVYCL	ENST00000215794.7	Not Supported	HLA-B*35:01	6	None	5	471.710	399.275	1.100	0.9	NA	NA	NA	NA	0.053	Subclonal	Pending
-22-46653595-46653596-G-A	3	4	PKDREJ	T1875I	1	YSYGLLHIY	ENST00000253255.5	Not Supported	HLA-B*35:01	8	None	6	69.335	39.921	0.890	0.43	NA	NA	NA	NA	0.233	Poor	Pending
-4-40434704-40434725-AGCGGCTGCGGCGGCTGCGGCC-A	2	1	RBM47	AAAAAAAA495-502A	1	SAAAAAAAV	ENST00000381793.2	Not Supported	HLA-B*35:01	9	None	2	508.630	427.517	2.600	2.5	NA	NA	NA	NA	0.977	Poor	Pending
-22-22550509-22550510-T-G		2	IGLV6-57	S63A	1	GSAPTTVIY	ENST00000390285.3	Not Supported	HLA-B*35:01	3	None	2	567.080	901.875	3.400	4.1	NA	NA	NA	NA	0.571	Poor	Pending
-22-50682229-50682230-T-C	2	3	TUBGCP6	H220R	1	SLFGALVRS	ENST00000248846.5	Not Supported	HLA-A*02:01	8	None	5	623.290	393.78	4.398	3.6	NA	NA	NA	NA	0.686	Poor	Pending
-22-38027027-38027028-C-G	1	1	GGA1	P484A	1	LLHTVSPEPA	ENST00000343632.4	Not Supported	HLA-A*02:01	10	None	2	723.710	7116.565	3.000	18.0	NA	NA	NA	NA	0.486	Poor	Pending
-22-19175521-19175522-G-T	2		CLTCL1	H1469N	1	SVNEALNNL	ENST00000263200.10	Not Supported	HLA-A*02:01	8	None	2	1010.045	793.83	4.898	4.9	NA	NA	NA	NA	0.100	Poor	Pending
-22-41920894-41920895-G-C		1	ACO2	E510Q	1	NPQTDYLTG	ENST00000216254.4	Not Supported	HLA-B*35:01	3	None	1	3121.315	3595.485	6.602	6.5	NA	NA	NA	NA	0.250	Poor	Pending
-22-41895790-41895791-C-A		1	ACO2	A33E	1	EMSHFEPNEY	ENST00000216254.4	Not Supported	HLA-B*35:01	1	None	1	3903.700	4456.03	2.600	3.7	NA	NA	NA	NA	0.044	Poor	Pending
-22-26936775-26936776-G-T	1		TPST2	P274H	1	DLIGKHGGV	ENST00000338754.4	Not Supported	HLA-A*02:01	6	None	1	4145.685	3402.989	13.000	7.7	NA	NA	NA	NA	0.179	Poor	Pending
-22-20709231-20709232-G-C			FAM230A	E322Q	0	IANQDAAQG	ENST00000434783.3	Not Supported	HLA-B*35:01	4	None	0	5738.350	5541.375	10.500	7.674	NA	NA	NA	NA	0.500	Poor	Pending
-22-50869713-50869714-C-A			PPP6R2	S414Y	0	YESRVEPPH	ENST00000395741.3	Not Supported	HLA-B*35:01	1	None	0	6159.300	18030.513	14.000	33.0	NA	NA	NA	NA	0.043	Poor	Pending
-22-22550449-22550450-C-G			IGLV6-57	R43G	0	ISCTGSSGSI	ENST00000390285.3	Not Supported	HLA-A*02:01	5	None	0	9672.631	20663.37	30.000	34.0	NA	NA	NA	NA	1.000	Poor	Pending
-22-18020271-18020272-G-A			CECR2	R535H	0	SGGSHVWTH	ENST00000262608.8	Not Supported	HLA-B*35:01	9	None	0	10761.970	30462.795	24.000	42.0	NA	NA	NA	NA	0.071	Poor	Pending
-22-29886116-29886117-C-A			NEFH	P830T	0	SPVKEEEKT	ENST00000310624.6	Not Supported	HLA-B*35:01	9	None	0	12628.540	17588.29	26.000	32.0	NA	NA	NA	NA	0.038	Poor	Pending
-22-37966274-37966275-C-G			LGALS2	E132Q	0	FNMSSFKLKQ	ENST00000215886.4	Not Supported	HLA-A*02:01	10	None	0	16963.785	16973.74	33.000	39.0	NA	NA	NA	NA	0.496	Poor	Pending
-22-50555769-50555770-G-A			MOV10L1	A482T	0	SAKTTVVVTT	ENST00000262794.5	Not Supported	HLA-A*02:01	10	None	0	27206.914	17665.8	40.000	26.0	NA	NA	NA	NA	0.042	Poor	Pending
+ID	Index	E*01:01	G*01:09	Gene	AA Change	Num Passing Transcripts	Best Peptide	Best Transcript	TSL	Allele	Pos	Prob Pos	Num Passing Peptides	IC50 MT	IC50 WT	%ile MT	%ile WT	RNA Expr	RNA VAF	Allele Expr	RNA Depth	DNA VAF	Tier	Evaluation
+22-41920894-41920895-G-C	19.ACO2.ENST00000216254.4.missense.510E/Q	2	1	ACO2	E510Q	1	KFNPQTDYL	ENST00000216254.4	Not Supported	HLA-G*01:09	5	None	3	1262.760	1318.61	0.500	0.6	NA	NA	NA	NA	0.250	Poor	Pending
+22-22550509-22550510-T-G	10.IGLV6-57.ENST00000390285.3.missense.63S/A	2	1	IGLV6-57	S63A	1	QRPGSAPTT	ENST00000390285.3	Not Supported	HLA-E*01:01	6	None	3	1362.110	1517.76	0.300	0.3	NA	NA	NA	NA	0.571	Poor	Pending
+22-46653595-46653596-G-A	20.PKDREJ.ENST00000253255.5.missense.1875T/I	1	5	PKDREJ	T1875I	1	LYYSYGLLHI	ENST00000253255.5	Not Supported	HLA-G*01:09	10	None	6	1469.280	2365.16	0.300	0.4	NA	NA	NA	NA	0.233	Poor	Pending
+22-38027027-38027028-C-G	15.GGA1.ENST00000343632.4.missense.484P/A	2		GGA1	P484A	1	ARPPQQPVP	ENST00000343632.4	Not Supported	HLA-E*01:01	1	None	2	1654.990	4242.33	0.400	4.8	NA	NA	NA	NA	0.486	Poor	Pending
+22-39994238-39994239-G-A	17.CACNA1I.ENST00000402142.3.missense.107C/Y	2	1	CACNA1I	C107Y	1	YQPCDDMDY	ENST00000402142.3	Not Supported	HLA-E*01:01	9	None	3	1864.160	1804.62	0.600	0.6	NA	NA	NA	NA	0.043	Poor	Pending
+6-41754573-41754573-C-CCTT	3.PRICKLE4.ENST00000458694.1.inframe_ins.287-288-/L	1	1	PRICKLE4	-287-288L	1	ATLSRTLLL	ENST00000458694.1	Not Supported	HLA-E*01:01	9	None	1	2122.610	3272.12	0.120	5.1	NA	NA	NA	NA	0.158	Poor	Pending
+22-18020271-18020272-G-A	5.CECR2.ENST00000262608.8.missense.535R/H	1		CECR2	R535H	1	WTHSRDPEG	ENST00000262608.8	Not Supported	HLA-E*01:01	3	None	1	2523.800	1765.99	1.400	0.5	NA	NA	NA	NA	0.071	Poor	Pending
+2-217498305-217498305-T-TGCTGCC	1.IGFBP2.ENST00000233809.4.inframe_ins.20L/LLP	2	1	IGFBP2	L20LLP	1	LLPLLPLLL	ENST00000233809.4	Not Supported	HLA-E*01:01	6-7	None	2	2551.250	3099.81	0.170	0.33	NA	NA	NA	NA	0.891	Poor	Pending
+22-18644672-18644673-C-T	6.USP18.ENST00000215794.7.missense.124A/V		1	USP18	A124V	1	LVYCLQKCN	ENST00000215794.7	Not Supported	HLA-G*01:09	2	None	1	3099.810	6399.79	3.301	12.0	NA	NA	NA	NA	0.053	Poor	Pending
+22-50615580-50615581-C-T	22.PANX2.ENST00000395842.2.missense.147S/F	1	1	PANX2	S147F	1	FLAFTRLTS	ENST00000395842.2	Not Supported	HLA-E*01:01	4	None	2	3343.700	3202.08	2.699	2.4	NA	NA	NA	NA	0.959	Poor	Pending
+22-37771017-37771018-G-A	13.ELFN2.ENST00000402918.2.missense.186P/L		1	ELFN2	P186L	1	MVCELAGNL	ENST00000402918.2	Not Supported	HLA-G*01:09	9	None	1	3454.020	10191.11	4.102	21.0	NA	NA	NA	NA	0.135	Poor	Pending
+22-37966274-37966275-C-G	14.LGALS2.ENST00000215886.4.missense.132E/Q	1		LGALS2	E132Q	1	NMSSFKLKQ	ENST00000215886.4	Not Supported	HLA-E*01:01	9	None	1	3890.570	3848.7	4.000	3.9	NA	NA	NA	NA	0.496	Poor	Pending
+22-41895790-41895791-C-A	18.ACO2.ENST00000216254.4.missense.33A/E	2		ACO2	A33E	1	EMSHFEPNE	ENST00000216254.4	Not Supported	HLA-E*01:01	1	None	2	3932.890	2443.19	4.199	1.3	NA	NA	NA	NA	0.044	Poor	Pending
+22-50682229-50682230-T-C	23.TUBGCP6.ENST00000248846.5.missense.220H/R	1	1	TUBGCP6	H220R	1	RSRTYDMDV	ENST00000248846.5	Not Supported	HLA-G*01:09	1	None	2	4576.120	7609.38	6.301	15.0	NA	NA	NA	NA	0.686	Poor	Pending
+22-19175521-19175522-G-T	7.CLTCL1.ENST00000263200.10.missense.1469H/N		1	CLTCL1	H1469N	1	SVNEALNNL	ENST00000263200.10	Not Supported	HLA-G*01:09	8	None	1	4989.870	7775.84	7.301	15.0	NA	NA	NA	NA	0.100	Poor	Pending
+22-50869713-50869714-C-A	24.PPP6R2.ENST00000395741.3.missense.414S/Y			PPP6R2	S414Y	0	GYESRVEPP	ENST00000395741.3	Not Supported	HLA-G*01:09	2	None	0	5620.540	22209.93	8.703	44.0	NA	NA	NA	NA	0.043	Poor	Pending
+22-20709231-20709232-G-C	8.FAM230A.ENST00000434783.3.missense.322E/Q			FAM230A	E322Q	0	ANQDAAQGI	ENST00000434783.3	Not Supported	HLA-G*01:09	3	None	0	5681.680	9654.43	8.797	20.0	NA	NA	NA	NA	0.500	Poor	Pending
+22-26936775-26936776-G-T	11.TPST2.ENST00000338754.4.missense.274P/H			TPST2	P274H	0	KHGGVSLSK	ENST00000338754.4	Not Supported	HLA-E*01:01	2	None	0	6539.790	20149.14	13.000	53.0	NA	NA	NA	NA	0.179	Poor	Pending
+22-29886116-29886117-C-A	12.NEFH.ENST00000310624.6.missense.830P/T			NEFH	P830T	0	KTQEVKVKE	ENST00000310624.6	Not Supported	HLA-G*01:09	2	None	0	7366.350	25842.4	14.000	50.0	NA	NA	NA	NA	0.038	Poor	Pending
+22-38119219-38119220-GA-G	16.TRIOBP.ENST00000406386.3.FS.219GA/G			TRIOBP	FS219	0	GEKAGCPWS	ENST00000406386.3	Not Supported	HLA-E*01:01	0-10	None	0	8119.760	NA	18.000	NA	NA	NA	NA	NA	0.768	Poor	Pending
+22-22550449-22550450-C-G	9.IGLV6-57.ENST00000390285.3.missense.43R/G			IGLV6-57	R43G	0	KTVTISCTG	ENST00000390285.3	Not Supported	HLA-G*01:09	9	None	0	9447.760	15208.39	19.000	31.0	NA	NA	NA	NA	1.000	Poor	Pending
+22-50555769-50555770-G-A	21.MOV10L1.ENST00000262794.5.missense.482A/T			MOV10L1	A482T	0	KTTVVVTTQ	ENST00000262794.5	Not Supported	HLA-G*01:09	8	None	0	10874.650	16763.86	23.000	34.0	NA	NA	NA	NA	0.042	Poor	Pending
+4-40434704-40434725-AGCGGCTGCGGCGGCTGCGGCC-A	2.RBM47.ENST00000381793.2.inframe_del.495-502AAAAAAAA/A			RBM47	AAAAAAAA495-502A	0	SAAAAAAAV	ENST00000381793.2	Not Supported	HLA-E*01:01	8-9	None	0	21040.320	21040.32	21.000	30.0	NA	NA	NA	NA	0.977	Poor	Pending