Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent pVACvector from clipping Best Peptide #1174

Merged
merged 2 commits into from
Jan 15, 2025
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Prevent pVACvector from clipping Best Peptide
  • Loading branch information
susannasiebert committed Jan 7, 2025
commit 9d316637325faa8bab8219410190b5e7caab6c62
38 changes: 25 additions & 13 deletions pvactools/lib/fasta_generator.py
Original file line number Diff line number Diff line change
@@ -7,6 +7,8 @@
from abc import ABCMeta
from Bio import SeqIO
import itertools
import logging

from pvactools.lib.proximal_variant import ProximalVariant

csv.field_size_limit(sys.maxsize)
@@ -364,13 +366,21 @@ def __init__(self, **kwargs):

def execute(self):
seq_dict = dict()
best_peptide = dict()
for record in SeqIO.parse(self.input_file, "fasta"):
seq_dict[record.id] = str(record.seq)
description = record.description.replace("{} ".format(record.id), "")
if description != "":
try:
best_peptide[record.id] = json.loads(description)['Best Peptide']
except:
pass

for length in self.epitope_lengths:
epitopes = dict()
fasta_sequences = OrderedDict()
wingspan_length = length - 1
warnings = set()
for (seq1, seq2) in self.junctions_to_test:
seq1_seq = seq_dict[seq1]
seq2_seq = seq_dict[seq2]
@@ -379,6 +389,19 @@ def execute(self):
#These combinations would've already been tested in previous attempts with lower clip lengths and can be skipped
if left_clip_length < self.clip_length and right_clip_length < self.clip_length:
continue
if seq1 in best_peptide:
seq1_best_peptide = best_peptide[seq1]
last_position = seq1_seq.rindex(seq1_best_peptide) + len(seq1_best_peptide)
end_distance = len(seq1_seq) - last_position
if left_clip_length > end_distance:
warnings.add("Clipping {} amino acids off the end of peptide {} would clip the best peptide. Skipping.".format(left_clip_length, seq1))
continue
if seq2 in best_peptide:
seq2_best_peptide = best_peptide[seq2]
first_position = seq2_seq.index(seq2_best_peptide)
if right_clip_length > first_position:
warnings.add("Clipping {} amino acids off the start of peptide {} would clip the best peptide. Skipping.".format(right_clip_length, seq2))
continue
trunc_seq1 = seq1_seq[(len(seq1_seq) - wingspan_length):(len(seq1_seq) - left_clip_length)]
trunc_seq2 = seq2_seq[(0 + right_clip_length):wingspan_length]

@@ -388,6 +411,8 @@ def execute(self):
else:
seq_ID = "{}|{}|{}|{}".format(seq1, left_clip_length, right_clip_length, seq2)
epitopes[seq_ID] = trunc_seq1 + trunc_seq2
for warning in list(warnings):
logging.info(warning)

for seq_id in epitopes:
sequence = epitopes[seq_id]
@@ -409,16 +434,3 @@ def execute(self):

writer.close()
key_writer.close()

def combine_problematic_peptides(self, seq_dict):
seq_tuples = []
for (seq_id, data) in seq_dict.items():
other_seq_ids = list(seq_dict.keys())
other_seq_ids.remove(seq_id)
if data['problematic_start']:
for other_seq_id in other_seq_ids:
seq_tuples.append((other_seq_id, seq_id))
if data['problematic_end']:
for other_seq_id in other_seq_ids:
seq_tuples.append((seq_id, other_seq_id))
return list(set(seq_tuples))
57 changes: 27 additions & 30 deletions pvactools/tools/pvacseq/generate_protein_fasta.py
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
import yaml
import csv
import re
import json
from collections import OrderedDict
from Bio import SeqIO
from Bio.Seq import Seq
@@ -125,14 +126,15 @@ def generate_fasta(flanking_sequence_length, downstream_sequence_length, temp_di
def parse_input_tsv(input_tsv):
if input_tsv is None:
return (None, None)
indexes = []
with open(input_tsv, 'r') as fh:
reader = csv.DictReader(fh, delimiter = "\t")
if 'Index' in reader.fieldnames:
indexes = parse_full_input_tsv(reader)
file_type = 'full'
else:
if 'Best Peptide' in reader.fieldnames:
indexes = parse_aggregated_input_tsv(reader)
file_type = 'aggregated'
else:
indexes = parse_full_input_tsv(reader)
file_type = 'full'
return (indexes, file_type)

def parse_full_input_tsv(reader):
@@ -142,9 +144,9 @@ def parse_full_input_tsv(reader):
return indexes

def parse_aggregated_input_tsv(reader):
indexes = []
indexes = {}
for line in reader:
indexes.append(line)
indexes[line['Index']] = line
return indexes

def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_evaluation):
@@ -155,7 +157,7 @@ def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_
with open(fasta_key_file_path, 'r') as fasta_key_file:
keys = yaml.load(fasta_key_file, Loader=yaml.FullLoader)

(tsv_indexes, tsv_file_type) = parse_input_tsv(input_tsv)
(tsv_indexes, file_type) = parse_input_tsv(input_tsv)

dataframe = OrderedDict()
output_records = []
@@ -165,33 +167,28 @@ def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_
if mutant_only and record_id.startswith('WT.'):
continue
if tsv_indexes is not None:
if tsv_file_type == 'full':
sequence_type, index = record_id.split('.', 1)
if index not in tsv_indexes:
continue
else:
(rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2)
(peptide_type, count, gene, transcript) = rest_record_id.split(".", 3)
(parsed_aa_change, _, _, _) = index_to_aggregate_report_aa_change(aa_change, variant_type)
matches = [i for i in tsv_indexes if i['Best Transcript'] == transcript and i['AA Change'] == parsed_aa_change and i['Evaluation'] in aggregate_report_evaluation]
if len(matches) == 0:
continue
new_record = SeqRecord(record.seq, id=record_id, description=record_id)
output_records.append(new_record)
sequence_type, index = record_id.split('.', 1)
if file_type == 'full':
if index in tsv_indexes:
new_record = SeqRecord(record.seq, id=record_id, description=record_id)
output_records.append(new_record)
elif file_type == 'aggregated':
if index in tsv_indexes.keys() and tsv_indexes[index]['Evaluation'] in aggregate_report_evaluation:
if record_id.startswith('MT.'):
annotations = { 'Best Peptide': tsv_indexes[index]['Best Peptide'] }
new_record = SeqRecord(record.seq, id=record_id, description=json.dumps(annotations))
else:
new_record = SeqRecord(record.seq, id=record_id, description=record_id)
output_records.append(new_record)
else:
new_record = SeqRecord(record.seq, id=record_id, description=record_id)
output_records.append(new_record)

if tsv_indexes is not None:
ordered_output_records = []
for tsv_index in tsv_indexes:
if tsv_file_type == 'full':
records = [r for r in output_records if r.id.split('.', 1)[1] == tsv_index]
ordered_output_records.extend(records)
else:
for output_record in output_records:
(rest_record_id, variant_type, aa_change) = output_record.id.rsplit(".", 2)
(peptide_type, count, gene, transcript) = rest_record_id.split(".", 3)
(parsed_aa_change, _, _, _) = index_to_aggregate_report_aa_change(aa_change, variant_type)
if tsv_index['Best Transcript'] == transcript and tsv_index['AA Change'] == parsed_aa_change:
ordered_output_records.append(output_record)
records = [r for r in output_records if r.id.split('.', 1)[1] == tsv_index]
ordered_output_records.extend(records)
output_records = ordered_output_records

SeqIO.write(output_records, output_file, "fasta")
48 changes: 24 additions & 24 deletions tests/test_data/pvacseq_generate_protein_fasta/input.aggregated.tsv
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
ID A*02:01 B*35:01 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Evaluation
2-217498305-217498305-T-TGCTGCC 9 4 IGFBP2 L20LLP 1 PLLPLLPLL ENST00000233809.4 Not Supported HLA-A*02:01 7 None 12 129.750 221.227 1.400 2.5 NA NA NA NA 0.891 Pass Pending
22-38119219-38119220-GA-G 1 TRIOBP FS219 1 CPWSGTGQH ENST00000406386.3 Not Supported HLA-B*35:01 0 None 1 212.962 NA 1.500 NA NA NA NA NA 0.768 Pass Pending
22-50615580-50615581-C-T 6 3 PANX2 S147F 1 ALGWEFLAFT ENST00000395842.2 Not Supported HLA-A*02:01 9 None 7 168.895 227.61 1.693 1.707 NA NA NA NA 0.959 Anchor Pending
22-39994238-39994239-G-A 3 3 CACNA1I C107Y 1 YLSDRCKIL ENST00000402142.3 Not Supported HLA-A*02:01 1 None 6 134.880 2639.458 2.100 8.2 NA NA NA NA 0.043 Subclonal Pending
6-41754573-41754573-C-CCTT 4 PRICKLE4 -287-288L 1 TLSRTLLLAA ENST00000458694.1 Not Supported HLA-A*02:01 8 None 4 312.720 332.88 2.900 2.8 NA NA NA NA 0.158 Subclonal Pending
22-37771017-37771018-G-A 5 3 ELFN2 P186L 1 NLFNCECDL ENST00000402918.2 Not Supported HLA-A*02:01 2 None 7 387.055 27322.265 2.000 40.0 NA NA NA NA 0.135 Subclonal Pending
22-18644672-18644673-C-T 2 3 USP18 A124V 1 RPLELVYCL ENST00000215794.7 Not Supported HLA-B*35:01 6 None 5 471.710 399.275 1.100 0.9 NA NA NA NA 0.053 Subclonal Pending
22-46653595-46653596-G-A 3 4 PKDREJ T1875I 1 YSYGLLHIY ENST00000253255.5 Not Supported HLA-B*35:01 8 None 6 69.335 39.921 0.890 0.43 NA NA NA NA 0.233 Poor Pending
4-40434704-40434725-AGCGGCTGCGGCGGCTGCGGCC-A 2 1 RBM47 AAAAAAAA495-502A 1 SAAAAAAAV ENST00000381793.2 Not Supported HLA-B*35:01 9 None 2 508.630 427.517 2.600 2.5 NA NA NA NA 0.977 Poor Pending
22-22550509-22550510-T-G 2 IGLV6-57 S63A 1 GSAPTTVIY ENST00000390285.3 Not Supported HLA-B*35:01 3 None 2 567.080 901.875 3.400 4.1 NA NA NA NA 0.571 Poor Pending
22-50682229-50682230-T-C 2 3 TUBGCP6 H220R 1 SLFGALVRS ENST00000248846.5 Not Supported HLA-A*02:01 8 None 5 623.290 393.78 4.398 3.6 NA NA NA NA 0.686 Poor Pending
22-38027027-38027028-C-G 1 1 GGA1 P484A 1 LLHTVSPEPA ENST00000343632.4 Not Supported HLA-A*02:01 10 None 2 723.710 7116.565 3.000 18.0 NA NA NA NA 0.486 Poor Pending
22-19175521-19175522-G-T 2 CLTCL1 H1469N 1 SVNEALNNL ENST00000263200.10 Not Supported HLA-A*02:01 8 None 2 1010.045 793.83 4.898 4.9 NA NA NA NA 0.100 Poor Pending
22-41920894-41920895-G-C 1 ACO2 E510Q 1 NPQTDYLTG ENST00000216254.4 Not Supported HLA-B*35:01 3 None 1 3121.315 3595.485 6.602 6.5 NA NA NA NA 0.250 Poor Pending
22-41895790-41895791-C-A 1 ACO2 A33E 1 EMSHFEPNEY ENST00000216254.4 Not Supported HLA-B*35:01 1 None 1 3903.700 4456.03 2.600 3.7 NA NA NA NA 0.044 Poor Pending
22-26936775-26936776-G-T 1 TPST2 P274H 1 DLIGKHGGV ENST00000338754.4 Not Supported HLA-A*02:01 6 None 1 4145.685 3402.989 13.000 7.7 NA NA NA NA 0.179 Poor Pending
22-20709231-20709232-G-C FAM230A E322Q 0 IANQDAAQG ENST00000434783.3 Not Supported HLA-B*35:01 4 None 0 5738.350 5541.375 10.500 7.674 NA NA NA NA 0.500 Poor Pending
22-50869713-50869714-C-A PPP6R2 S414Y 0 YESRVEPPH ENST00000395741.3 Not Supported HLA-B*35:01 1 None 0 6159.300 18030.513 14.000 33.0 NA NA NA NA 0.043 Poor Pending
22-22550449-22550450-C-G IGLV6-57 R43G 0 ISCTGSSGSI ENST00000390285.3 Not Supported HLA-A*02:01 5 None 0 9672.631 20663.37 30.000 34.0 NA NA NA NA 1.000 Poor Pending
22-18020271-18020272-G-A CECR2 R535H 0 SGGSHVWTH ENST00000262608.8 Not Supported HLA-B*35:01 9 None 0 10761.970 30462.795 24.000 42.0 NA NA NA NA 0.071 Poor Pending
22-29886116-29886117-C-A NEFH P830T 0 SPVKEEEKT ENST00000310624.6 Not Supported HLA-B*35:01 9 None 0 12628.540 17588.29 26.000 32.0 NA NA NA NA 0.038 Poor Pending
22-37966274-37966275-C-G LGALS2 E132Q 0 FNMSSFKLKQ ENST00000215886.4 Not Supported HLA-A*02:01 10 None 0 16963.785 16973.74 33.000 39.0 NA NA NA NA 0.496 Poor Pending
22-50555769-50555770-G-A MOV10L1 A482T 0 SAKTTVVVTT ENST00000262794.5 Not Supported HLA-A*02:01 10 None 0 27206.914 17665.8 40.000 26.0 NA NA NA NA 0.042 Poor Pending
ID Index E*01:01 G*01:09 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Evaluation
22-41920894-41920895-G-C 19.ACO2.ENST00000216254.4.missense.510E/Q 2 1 ACO2 E510Q 1 KFNPQTDYL ENST00000216254.4 Not Supported HLA-G*01:09 5 None 3 1262.760 1318.61 0.500 0.6 NA NA NA NA 0.250 Poor Pending
22-22550509-22550510-T-G 10.IGLV6-57.ENST00000390285.3.missense.63S/A 2 1 IGLV6-57 S63A 1 QRPGSAPTT ENST00000390285.3 Not Supported HLA-E*01:01 6 None 3 1362.110 1517.76 0.300 0.3 NA NA NA NA 0.571 Poor Pending
22-46653595-46653596-G-A 20.PKDREJ.ENST00000253255.5.missense.1875T/I 1 5 PKDREJ T1875I 1 LYYSYGLLHI ENST00000253255.5 Not Supported HLA-G*01:09 10 None 6 1469.280 2365.16 0.300 0.4 NA NA NA NA 0.233 Poor Pending
22-38027027-38027028-C-G 15.GGA1.ENST00000343632.4.missense.484P/A 2 GGA1 P484A 1 ARPPQQPVP ENST00000343632.4 Not Supported HLA-E*01:01 1 None 2 1654.990 4242.33 0.400 4.8 NA NA NA NA 0.486 Poor Pending
22-39994238-39994239-G-A 17.CACNA1I.ENST00000402142.3.missense.107C/Y 2 1 CACNA1I C107Y 1 YQPCDDMDY ENST00000402142.3 Not Supported HLA-E*01:01 9 None 3 1864.160 1804.62 0.600 0.6 NA NA NA NA 0.043 Poor Pending
6-41754573-41754573-C-CCTT 3.PRICKLE4.ENST00000458694.1.inframe_ins.287-288-/L 1 1 PRICKLE4 -287-288L 1 ATLSRTLLL ENST00000458694.1 Not Supported HLA-E*01:01 9 None 1 2122.610 3272.12 0.120 5.1 NA NA NA NA 0.158 Poor Pending
22-18020271-18020272-G-A 5.CECR2.ENST00000262608.8.missense.535R/H 1 CECR2 R535H 1 WTHSRDPEG ENST00000262608.8 Not Supported HLA-E*01:01 3 None 1 2523.800 1765.99 1.400 0.5 NA NA NA NA 0.071 Poor Pending
2-217498305-217498305-T-TGCTGCC 1.IGFBP2.ENST00000233809.4.inframe_ins.20L/LLP 2 1 IGFBP2 L20LLP 1 LLPLLPLLL ENST00000233809.4 Not Supported HLA-E*01:01 6-7 None 2 2551.250 3099.81 0.170 0.33 NA NA NA NA 0.891 Poor Pending
22-18644672-18644673-C-T 6.USP18.ENST00000215794.7.missense.124A/V 1 USP18 A124V 1 LVYCLQKCN ENST00000215794.7 Not Supported HLA-G*01:09 2 None 1 3099.810 6399.79 3.301 12.0 NA NA NA NA 0.053 Poor Pending
22-50615580-50615581-C-T 22.PANX2.ENST00000395842.2.missense.147S/F 1 1 PANX2 S147F 1 FLAFTRLTS ENST00000395842.2 Not Supported HLA-E*01:01 4 None 2 3343.700 3202.08 2.699 2.4 NA NA NA NA 0.959 Poor Pending
22-37771017-37771018-G-A 13.ELFN2.ENST00000402918.2.missense.186P/L 1 ELFN2 P186L 1 MVCELAGNL ENST00000402918.2 Not Supported HLA-G*01:09 9 None 1 3454.020 10191.11 4.102 21.0 NA NA NA NA 0.135 Poor Pending
22-37966274-37966275-C-G 14.LGALS2.ENST00000215886.4.missense.132E/Q 1 LGALS2 E132Q 1 NMSSFKLKQ ENST00000215886.4 Not Supported HLA-E*01:01 9 None 1 3890.570 3848.7 4.000 3.9 NA NA NA NA 0.496 Poor Pending
22-41895790-41895791-C-A 18.ACO2.ENST00000216254.4.missense.33A/E 2 ACO2 A33E 1 EMSHFEPNE ENST00000216254.4 Not Supported HLA-E*01:01 1 None 2 3932.890 2443.19 4.199 1.3 NA NA NA NA 0.044 Poor Pending
22-50682229-50682230-T-C 23.TUBGCP6.ENST00000248846.5.missense.220H/R 1 1 TUBGCP6 H220R 1 RSRTYDMDV ENST00000248846.5 Not Supported HLA-G*01:09 1 None 2 4576.120 7609.38 6.301 15.0 NA NA NA NA 0.686 Poor Pending
22-19175521-19175522-G-T 7.CLTCL1.ENST00000263200.10.missense.1469H/N 1 CLTCL1 H1469N 1 SVNEALNNL ENST00000263200.10 Not Supported HLA-G*01:09 8 None 1 4989.870 7775.84 7.301 15.0 NA NA NA NA 0.100 Poor Pending
22-50869713-50869714-C-A 24.PPP6R2.ENST00000395741.3.missense.414S/Y PPP6R2 S414Y 0 GYESRVEPP ENST00000395741.3 Not Supported HLA-G*01:09 2 None 0 5620.540 22209.93 8.703 44.0 NA NA NA NA 0.043 Poor Pending
22-20709231-20709232-G-C 8.FAM230A.ENST00000434783.3.missense.322E/Q FAM230A E322Q 0 ANQDAAQGI ENST00000434783.3 Not Supported HLA-G*01:09 3 None 0 5681.680 9654.43 8.797 20.0 NA NA NA NA 0.500 Poor Pending
22-26936775-26936776-G-T 11.TPST2.ENST00000338754.4.missense.274P/H TPST2 P274H 0 KHGGVSLSK ENST00000338754.4 Not Supported HLA-E*01:01 2 None 0 6539.790 20149.14 13.000 53.0 NA NA NA NA 0.179 Poor Pending
22-29886116-29886117-C-A 12.NEFH.ENST00000310624.6.missense.830P/T NEFH P830T 0 KTQEVKVKE ENST00000310624.6 Not Supported HLA-G*01:09 2 None 0 7366.350 25842.4 14.000 50.0 NA NA NA NA 0.038 Poor Pending
22-38119219-38119220-GA-G 16.TRIOBP.ENST00000406386.3.FS.219GA/G TRIOBP FS219 0 GEKAGCPWS ENST00000406386.3 Not Supported HLA-E*01:01 0-10 None 0 8119.760 NA 18.000 NA NA NA NA NA 0.768 Poor Pending
22-22550449-22550450-C-G 9.IGLV6-57.ENST00000390285.3.missense.43R/G IGLV6-57 R43G 0 KTVTISCTG ENST00000390285.3 Not Supported HLA-G*01:09 9 None 0 9447.760 15208.39 19.000 31.0 NA NA NA NA 1.000 Poor Pending
22-50555769-50555770-G-A 21.MOV10L1.ENST00000262794.5.missense.482A/T MOV10L1 A482T 0 KTTVVVTTQ ENST00000262794.5 Not Supported HLA-G*01:09 8 None 0 10874.650 16763.86 23.000 34.0 NA NA NA NA 0.042 Poor Pending
4-40434704-40434725-AGCGGCTGCGGCGGCTGCGGCC-A 2.RBM47.ENST00000381793.2.inframe_del.495-502AAAAAAAA/A RBM47 AAAAAAAA495-502A 0 SAAAAAAAV ENST00000381793.2 Not Supported HLA-E*01:01 8-9 None 0 21040.320 21040.32 21.000 30.0 NA NA NA NA 0.977 Poor Pending
Loading