Skip to content

Commit

Permalink
Merge pull request #1174 from griffithlab/pvacvector_clipping
Browse files Browse the repository at this point in the history
Prevent pVACvector from clipping Best Peptide
  • Loading branch information
susannasiebert authored Jan 15, 2025
2 parents c8c2f70 + c72af6a commit 6f19436
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 138 deletions.
38 changes: 25 additions & 13 deletions pvactools/lib/fasta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from abc import ABCMeta
from Bio import SeqIO
import itertools
import logging

from pvactools.lib.proximal_variant import ProximalVariant

csv.field_size_limit(sys.maxsize)
Expand Down Expand Up @@ -364,13 +366,21 @@ def __init__(self, **kwargs):

def execute(self):
seq_dict = dict()
best_peptide = dict()
for record in SeqIO.parse(self.input_file, "fasta"):
seq_dict[record.id] = str(record.seq)
description = record.description.replace("{} ".format(record.id), "")
if description != "":
try:
best_peptide[record.id] = json.loads(description)['Best Peptide']
except:
pass

for length in self.epitope_lengths:
epitopes = dict()
fasta_sequences = OrderedDict()
wingspan_length = length - 1
warnings = set()
for (seq1, seq2) in self.junctions_to_test:
seq1_seq = seq_dict[seq1]
seq2_seq = seq_dict[seq2]
Expand All @@ -379,6 +389,19 @@ def execute(self):
#These combinations would've already been tested in previous attempts with lower clip lengths and can be skipped
if left_clip_length < self.clip_length and right_clip_length < self.clip_length:
continue
if seq1 in best_peptide:
seq1_best_peptide = best_peptide[seq1]
last_position = seq1_seq.rindex(seq1_best_peptide) + len(seq1_best_peptide)
end_distance = len(seq1_seq) - last_position
if left_clip_length > end_distance:
warnings.add("Clipping {} amino acids off the end of peptide {} would clip the best peptide. Skipping.".format(left_clip_length, seq1))
continue
if seq2 in best_peptide:
seq2_best_peptide = best_peptide[seq2]
first_position = seq2_seq.index(seq2_best_peptide)
if right_clip_length > first_position:
warnings.add("Clipping {} amino acids off the start of peptide {} would clip the best peptide. Skipping.".format(right_clip_length, seq2))
continue
trunc_seq1 = seq1_seq[(len(seq1_seq) - wingspan_length):(len(seq1_seq) - left_clip_length)]
trunc_seq2 = seq2_seq[(0 + right_clip_length):wingspan_length]

Expand All @@ -388,6 +411,8 @@ def execute(self):
else:
seq_ID = "{}|{}|{}|{}".format(seq1, left_clip_length, right_clip_length, seq2)
epitopes[seq_ID] = trunc_seq1 + trunc_seq2
for warning in list(warnings):
logging.info(warning)

for seq_id in epitopes:
sequence = epitopes[seq_id]
Expand All @@ -409,16 +434,3 @@ def execute(self):

writer.close()
key_writer.close()

def combine_problematic_peptides(self, seq_dict):
seq_tuples = []
for (seq_id, data) in seq_dict.items():
other_seq_ids = list(seq_dict.keys())
other_seq_ids.remove(seq_id)
if data['problematic_start']:
for other_seq_id in other_seq_ids:
seq_tuples.append((other_seq_id, seq_id))
if data['problematic_end']:
for other_seq_id in other_seq_ids:
seq_tuples.append((seq_id, other_seq_id))
return list(set(seq_tuples))
57 changes: 27 additions & 30 deletions pvactools/tools/pvacseq/generate_protein_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import yaml
import csv
import re
import json
from collections import OrderedDict
from Bio import SeqIO
from Bio.Seq import Seq
Expand Down Expand Up @@ -125,14 +126,15 @@ def generate_fasta(flanking_sequence_length, downstream_sequence_length, temp_di
def parse_input_tsv(input_tsv):
if input_tsv is None:
return (None, None)
indexes = []
with open(input_tsv, 'r') as fh:
reader = csv.DictReader(fh, delimiter = "\t")
if 'Index' in reader.fieldnames:
indexes = parse_full_input_tsv(reader)
file_type = 'full'
else:
if 'Best Peptide' in reader.fieldnames:
indexes = parse_aggregated_input_tsv(reader)
file_type = 'aggregated'
else:
indexes = parse_full_input_tsv(reader)
file_type = 'full'
return (indexes, file_type)

def parse_full_input_tsv(reader):
Expand All @@ -142,9 +144,9 @@ def parse_full_input_tsv(reader):
return indexes

def parse_aggregated_input_tsv(reader):
indexes = []
indexes = {}
for line in reader:
indexes.append(line)
indexes[line['Index']] = line
return indexes

def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_evaluation):
Expand All @@ -155,7 +157,7 @@ def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_
with open(fasta_key_file_path, 'r') as fasta_key_file:
keys = yaml.load(fasta_key_file, Loader=yaml.FullLoader)

(tsv_indexes, tsv_file_type) = parse_input_tsv(input_tsv)
(tsv_indexes, file_type) = parse_input_tsv(input_tsv)

dataframe = OrderedDict()
output_records = []
Expand All @@ -165,33 +167,28 @@ def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_
if mutant_only and record_id.startswith('WT.'):
continue
if tsv_indexes is not None:
if tsv_file_type == 'full':
sequence_type, index = record_id.split('.', 1)
if index not in tsv_indexes:
continue
else:
(rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2)
(peptide_type, count, gene, transcript) = rest_record_id.split(".", 3)
(parsed_aa_change, _, _, _) = index_to_aggregate_report_aa_change(aa_change, variant_type)
matches = [i for i in tsv_indexes if i['Best Transcript'] == transcript and i['AA Change'] == parsed_aa_change and i['Evaluation'] in aggregate_report_evaluation]
if len(matches) == 0:
continue
new_record = SeqRecord(record.seq, id=record_id, description=record_id)
output_records.append(new_record)
sequence_type, index = record_id.split('.', 1)
if file_type == 'full':
if index in tsv_indexes:
new_record = SeqRecord(record.seq, id=record_id, description=record_id)
output_records.append(new_record)
elif file_type == 'aggregated':
if index in tsv_indexes.keys() and tsv_indexes[index]['Evaluation'] in aggregate_report_evaluation:
if record_id.startswith('MT.'):
annotations = { 'Best Peptide': tsv_indexes[index]['Best Peptide'] }
new_record = SeqRecord(record.seq, id=record_id, description=json.dumps(annotations))
else:
new_record = SeqRecord(record.seq, id=record_id, description=record_id)
output_records.append(new_record)
else:
new_record = SeqRecord(record.seq, id=record_id, description=record_id)
output_records.append(new_record)

if tsv_indexes is not None:
ordered_output_records = []
for tsv_index in tsv_indexes:
if tsv_file_type == 'full':
records = [r for r in output_records if r.id.split('.', 1)[1] == tsv_index]
ordered_output_records.extend(records)
else:
for output_record in output_records:
(rest_record_id, variant_type, aa_change) = output_record.id.rsplit(".", 2)
(peptide_type, count, gene, transcript) = rest_record_id.split(".", 3)
(parsed_aa_change, _, _, _) = index_to_aggregate_report_aa_change(aa_change, variant_type)
if tsv_index['Best Transcript'] == transcript and tsv_index['AA Change'] == parsed_aa_change:
ordered_output_records.append(output_record)
records = [r for r in output_records if r.id.split('.', 1)[1] == tsv_index]
ordered_output_records.extend(records)
output_records = ordered_output_records

SeqIO.write(output_records, output_file, "fasta")
Expand Down
48 changes: 24 additions & 24 deletions tests/test_data/pvacseq_generate_protein_fasta/input.aggregated.tsv
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
ID A*02:01 B*35:01 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Evaluation
2-217498305-217498305-T-TGCTGCC 9 4 IGFBP2 L20LLP 1 PLLPLLPLL ENST00000233809.4 Not Supported HLA-A*02:01 7 None 12 129.750 221.227 1.400 2.5 NA NA NA NA 0.891 Pass Pending
22-38119219-38119220-GA-G 1 TRIOBP FS219 1 CPWSGTGQH ENST00000406386.3 Not Supported HLA-B*35:01 0 None 1 212.962 NA 1.500 NA NA NA NA NA 0.768 Pass Pending
22-50615580-50615581-C-T 6 3 PANX2 S147F 1 ALGWEFLAFT ENST00000395842.2 Not Supported HLA-A*02:01 9 None 7 168.895 227.61 1.693 1.707 NA NA NA NA 0.959 Anchor Pending
22-39994238-39994239-G-A 3 3 CACNA1I C107Y 1 YLSDRCKIL ENST00000402142.3 Not Supported HLA-A*02:01 1 None 6 134.880 2639.458 2.100 8.2 NA NA NA NA 0.043 Subclonal Pending
6-41754573-41754573-C-CCTT 4 PRICKLE4 -287-288L 1 TLSRTLLLAA ENST00000458694.1 Not Supported HLA-A*02:01 8 None 4 312.720 332.88 2.900 2.8 NA NA NA NA 0.158 Subclonal Pending
22-37771017-37771018-G-A 5 3 ELFN2 P186L 1 NLFNCECDL ENST00000402918.2 Not Supported HLA-A*02:01 2 None 7 387.055 27322.265 2.000 40.0 NA NA NA NA 0.135 Subclonal Pending
22-18644672-18644673-C-T 2 3 USP18 A124V 1 RPLELVYCL ENST00000215794.7 Not Supported HLA-B*35:01 6 None 5 471.710 399.275 1.100 0.9 NA NA NA NA 0.053 Subclonal Pending
22-46653595-46653596-G-A 3 4 PKDREJ T1875I 1 YSYGLLHIY ENST00000253255.5 Not Supported HLA-B*35:01 8 None 6 69.335 39.921 0.890 0.43 NA NA NA NA 0.233 Poor Pending
4-40434704-40434725-AGCGGCTGCGGCGGCTGCGGCC-A 2 1 RBM47 AAAAAAAA495-502A 1 SAAAAAAAV ENST00000381793.2 Not Supported HLA-B*35:01 9 None 2 508.630 427.517 2.600 2.5 NA NA NA NA 0.977 Poor Pending
22-22550509-22550510-T-G 2 IGLV6-57 S63A 1 GSAPTTVIY ENST00000390285.3 Not Supported HLA-B*35:01 3 None 2 567.080 901.875 3.400 4.1 NA NA NA NA 0.571 Poor Pending
22-50682229-50682230-T-C 2 3 TUBGCP6 H220R 1 SLFGALVRS ENST00000248846.5 Not Supported HLA-A*02:01 8 None 5 623.290 393.78 4.398 3.6 NA NA NA NA 0.686 Poor Pending
22-38027027-38027028-C-G 1 1 GGA1 P484A 1 LLHTVSPEPA ENST00000343632.4 Not Supported HLA-A*02:01 10 None 2 723.710 7116.565 3.000 18.0 NA NA NA NA 0.486 Poor Pending
22-19175521-19175522-G-T 2 CLTCL1 H1469N 1 SVNEALNNL ENST00000263200.10 Not Supported HLA-A*02:01 8 None 2 1010.045 793.83 4.898 4.9 NA NA NA NA 0.100 Poor Pending
22-41920894-41920895-G-C 1 ACO2 E510Q 1 NPQTDYLTG ENST00000216254.4 Not Supported HLA-B*35:01 3 None 1 3121.315 3595.485 6.602 6.5 NA NA NA NA 0.250 Poor Pending
22-41895790-41895791-C-A 1 ACO2 A33E 1 EMSHFEPNEY ENST00000216254.4 Not Supported HLA-B*35:01 1 None 1 3903.700 4456.03 2.600 3.7 NA NA NA NA 0.044 Poor Pending
22-26936775-26936776-G-T 1 TPST2 P274H 1 DLIGKHGGV ENST00000338754.4 Not Supported HLA-A*02:01 6 None 1 4145.685 3402.989 13.000 7.7 NA NA NA NA 0.179 Poor Pending
22-20709231-20709232-G-C FAM230A E322Q 0 IANQDAAQG ENST00000434783.3 Not Supported HLA-B*35:01 4 None 0 5738.350 5541.375 10.500 7.674 NA NA NA NA 0.500 Poor Pending
22-50869713-50869714-C-A PPP6R2 S414Y 0 YESRVEPPH ENST00000395741.3 Not Supported HLA-B*35:01 1 None 0 6159.300 18030.513 14.000 33.0 NA NA NA NA 0.043 Poor Pending
22-22550449-22550450-C-G IGLV6-57 R43G 0 ISCTGSSGSI ENST00000390285.3 Not Supported HLA-A*02:01 5 None 0 9672.631 20663.37 30.000 34.0 NA NA NA NA 1.000 Poor Pending
22-18020271-18020272-G-A CECR2 R535H 0 SGGSHVWTH ENST00000262608.8 Not Supported HLA-B*35:01 9 None 0 10761.970 30462.795 24.000 42.0 NA NA NA NA 0.071 Poor Pending
22-29886116-29886117-C-A NEFH P830T 0 SPVKEEEKT ENST00000310624.6 Not Supported HLA-B*35:01 9 None 0 12628.540 17588.29 26.000 32.0 NA NA NA NA 0.038 Poor Pending
22-37966274-37966275-C-G LGALS2 E132Q 0 FNMSSFKLKQ ENST00000215886.4 Not Supported HLA-A*02:01 10 None 0 16963.785 16973.74 33.000 39.0 NA NA NA NA 0.496 Poor Pending
22-50555769-50555770-G-A MOV10L1 A482T 0 SAKTTVVVTT ENST00000262794.5 Not Supported HLA-A*02:01 10 None 0 27206.914 17665.8 40.000 26.0 NA NA NA NA 0.042 Poor Pending
ID Index E*01:01 G*01:09 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Evaluation
22-41920894-41920895-G-C 19.ACO2.ENST00000216254.4.missense.510E/Q 2 1 ACO2 E510Q 1 KFNPQTDYL ENST00000216254.4 Not Supported HLA-G*01:09 5 None 3 1262.760 1318.61 0.500 0.6 NA NA NA NA 0.250 Poor Pending
22-22550509-22550510-T-G 10.IGLV6-57.ENST00000390285.3.missense.63S/A 2 1 IGLV6-57 S63A 1 QRPGSAPTT ENST00000390285.3 Not Supported HLA-E*01:01 6 None 3 1362.110 1517.76 0.300 0.3 NA NA NA NA 0.571 Poor Pending
22-46653595-46653596-G-A 20.PKDREJ.ENST00000253255.5.missense.1875T/I 1 5 PKDREJ T1875I 1 LYYSYGLLHI ENST00000253255.5 Not Supported HLA-G*01:09 10 None 6 1469.280 2365.16 0.300 0.4 NA NA NA NA 0.233 Poor Pending
22-38027027-38027028-C-G 15.GGA1.ENST00000343632.4.missense.484P/A 2 GGA1 P484A 1 ARPPQQPVP ENST00000343632.4 Not Supported HLA-E*01:01 1 None 2 1654.990 4242.33 0.400 4.8 NA NA NA NA 0.486 Poor Pending
22-39994238-39994239-G-A 17.CACNA1I.ENST00000402142.3.missense.107C/Y 2 1 CACNA1I C107Y 1 YQPCDDMDY ENST00000402142.3 Not Supported HLA-E*01:01 9 None 3 1864.160 1804.62 0.600 0.6 NA NA NA NA 0.043 Poor Pending
6-41754573-41754573-C-CCTT 3.PRICKLE4.ENST00000458694.1.inframe_ins.287-288-/L 1 1 PRICKLE4 -287-288L 1 ATLSRTLLL ENST00000458694.1 Not Supported HLA-E*01:01 9 None 1 2122.610 3272.12 0.120 5.1 NA NA NA NA 0.158 Poor Pending
22-18020271-18020272-G-A 5.CECR2.ENST00000262608.8.missense.535R/H 1 CECR2 R535H 1 WTHSRDPEG ENST00000262608.8 Not Supported HLA-E*01:01 3 None 1 2523.800 1765.99 1.400 0.5 NA NA NA NA 0.071 Poor Pending
2-217498305-217498305-T-TGCTGCC 1.IGFBP2.ENST00000233809.4.inframe_ins.20L/LLP 2 1 IGFBP2 L20LLP 1 LLPLLPLLL ENST00000233809.4 Not Supported HLA-E*01:01 6-7 None 2 2551.250 3099.81 0.170 0.33 NA NA NA NA 0.891 Poor Pending
22-18644672-18644673-C-T 6.USP18.ENST00000215794.7.missense.124A/V 1 USP18 A124V 1 LVYCLQKCN ENST00000215794.7 Not Supported HLA-G*01:09 2 None 1 3099.810 6399.79 3.301 12.0 NA NA NA NA 0.053 Poor Pending
22-50615580-50615581-C-T 22.PANX2.ENST00000395842.2.missense.147S/F 1 1 PANX2 S147F 1 FLAFTRLTS ENST00000395842.2 Not Supported HLA-E*01:01 4 None 2 3343.700 3202.08 2.699 2.4 NA NA NA NA 0.959 Poor Pending
22-37771017-37771018-G-A 13.ELFN2.ENST00000402918.2.missense.186P/L 1 ELFN2 P186L 1 MVCELAGNL ENST00000402918.2 Not Supported HLA-G*01:09 9 None 1 3454.020 10191.11 4.102 21.0 NA NA NA NA 0.135 Poor Pending
22-37966274-37966275-C-G 14.LGALS2.ENST00000215886.4.missense.132E/Q 1 LGALS2 E132Q 1 NMSSFKLKQ ENST00000215886.4 Not Supported HLA-E*01:01 9 None 1 3890.570 3848.7 4.000 3.9 NA NA NA NA 0.496 Poor Pending
22-41895790-41895791-C-A 18.ACO2.ENST00000216254.4.missense.33A/E 2 ACO2 A33E 1 EMSHFEPNE ENST00000216254.4 Not Supported HLA-E*01:01 1 None 2 3932.890 2443.19 4.199 1.3 NA NA NA NA 0.044 Poor Pending
22-50682229-50682230-T-C 23.TUBGCP6.ENST00000248846.5.missense.220H/R 1 1 TUBGCP6 H220R 1 RSRTYDMDV ENST00000248846.5 Not Supported HLA-G*01:09 1 None 2 4576.120 7609.38 6.301 15.0 NA NA NA NA 0.686 Poor Pending
22-19175521-19175522-G-T 7.CLTCL1.ENST00000263200.10.missense.1469H/N 1 CLTCL1 H1469N 1 SVNEALNNL ENST00000263200.10 Not Supported HLA-G*01:09 8 None 1 4989.870 7775.84 7.301 15.0 NA NA NA NA 0.100 Poor Pending
22-50869713-50869714-C-A 24.PPP6R2.ENST00000395741.3.missense.414S/Y PPP6R2 S414Y 0 GYESRVEPP ENST00000395741.3 Not Supported HLA-G*01:09 2 None 0 5620.540 22209.93 8.703 44.0 NA NA NA NA 0.043 Poor Pending
22-20709231-20709232-G-C 8.FAM230A.ENST00000434783.3.missense.322E/Q FAM230A E322Q 0 ANQDAAQGI ENST00000434783.3 Not Supported HLA-G*01:09 3 None 0 5681.680 9654.43 8.797 20.0 NA NA NA NA 0.500 Poor Pending
22-26936775-26936776-G-T 11.TPST2.ENST00000338754.4.missense.274P/H TPST2 P274H 0 KHGGVSLSK ENST00000338754.4 Not Supported HLA-E*01:01 2 None 0 6539.790 20149.14 13.000 53.0 NA NA NA NA 0.179 Poor Pending
22-29886116-29886117-C-A 12.NEFH.ENST00000310624.6.missense.830P/T NEFH P830T 0 KTQEVKVKE ENST00000310624.6 Not Supported HLA-G*01:09 2 None 0 7366.350 25842.4 14.000 50.0 NA NA NA NA 0.038 Poor Pending
22-38119219-38119220-GA-G 16.TRIOBP.ENST00000406386.3.FS.219GA/G TRIOBP FS219 0 GEKAGCPWS ENST00000406386.3 Not Supported HLA-E*01:01 0-10 None 0 8119.760 NA 18.000 NA NA NA NA NA 0.768 Poor Pending
22-22550449-22550450-C-G 9.IGLV6-57.ENST00000390285.3.missense.43R/G IGLV6-57 R43G 0 KTVTISCTG ENST00000390285.3 Not Supported HLA-G*01:09 9 None 0 9447.760 15208.39 19.000 31.0 NA NA NA NA 1.000 Poor Pending
22-50555769-50555770-G-A 21.MOV10L1.ENST00000262794.5.missense.482A/T MOV10L1 A482T 0 KTTVVVTTQ ENST00000262794.5 Not Supported HLA-G*01:09 8 None 0 10874.650 16763.86 23.000 34.0 NA NA NA NA 0.042 Poor Pending
4-40434704-40434725-AGCGGCTGCGGCGGCTGCGGCC-A 2.RBM47.ENST00000381793.2.inframe_del.495-502AAAAAAAA/A RBM47 AAAAAAAA495-502A 0 SAAAAAAAV ENST00000381793.2 Not Supported HLA-E*01:01 8-9 None 0 21040.320 21040.32 21.000 30.0 NA NA NA NA 0.977 Poor Pending
Loading

0 comments on commit 6f19436

Please sign in to comment.