Merge pull request #1153 from griffithlab/ref_match

Fix bug where the incorrect neoantigen fasta entry was used for ref proteome search
griffithlab · Dec 12, 2024 · 346b0db · 346b0db
2 parents e29e082 + 7905faf
commit 346b0db
Show file tree

Hide file tree

Showing 28 changed files with 13,145 additions and 13,166 deletions.
diff --git a/docs/pvacseq/output_files.rst b/docs/pvacseq/output_files.rst
@@ -305,6 +305,8 @@ included eptiopes, selecting the best-scoring epitope, and which values are outp
      - Description
    * - ``ID``
      - A unique identifier for the variant
+   * - ``Index``
+     - A unique identifier for the variant and Best Transcript
    * - ``HLA Alleles`` (multiple)
      - For each HLA allele in the run, the number of this variant's epitopes that bound well
        to the HLA allele (with median/lowest mutant binding affinity < binding_threshold)

diff --git a/pvactools/lib/aggregate_all_epitopes.py b/pvactools/lib/aggregate_all_epitopes.py
@@ -165,7 +165,7 @@ def determine_used_percentile_algorithms(self, prediction_algorithms, el_algorit
 
     def determine_columns_used_for_aggregation(self, prediction_algorithms, el_algorithms):
         used_columns = [
-            "Chromosome", "Start", "Stop", "Reference", "Variant",
+            "Index", "Chromosome", "Start", "Stop", "Reference", "Variant",
             "Transcript", "Transcript Support Level", "Biotype", "Transcript Length", "Variant Type", "Mutation",
             "Protein Position", "Gene Name", "HLA Allele",
             "Mutation Position", "MT Epitope Seq", "WT Epitope Seq",
@@ -754,7 +754,7 @@ def assemble_result_line(self, best, key, vaf_clonal, hla, anno_count, included_
         problematic_positions = best['Problematic Positions'] if 'Problematic Positions' in best else 'None'
         tsl = best['Transcript Support Level'] if best['Transcript Support Level'] == "Not Supported" or pd.isna(best['Transcript Support Level']) else str(int(best['Transcript Support Level']))
 
-        out_dict = { 'ID': key }
+        out_dict = { 'ID': key, 'Index': best['Index'] }
         out_dict.update({ k.replace('HLA-', ''):v for k,v in sorted(hla.items()) })
         out_dict.update({
             'Gene': best["Gene Name"],

diff --git a/pvactools/lib/calculate_reference_proteome_similarity.py b/pvactools/lib/calculate_reference_proteome_similarity.py
@@ -259,22 +259,10 @@ def metric_headers(self):
 
 
     def _input_tsv_type(self, line):
-        if self.file_type == 'pVACseq' and 'Index' in line:
-            return 'full'
-        elif self.file_type != 'pVACseq' and 'Mutation' in line:
-            return 'full'
-        else:
+        if 'Best Peptide' in line:
             return 'aggregated'
-
-    def _get_full_peptide(self, line, mt_records_dict, wt_records_dict):
-        for record_id in mt_records_dict.keys():
-            (rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2)
-            (count, gene, transcript) = rest_record_id.split(".", 2)
-            (parsed_aa_change, pos, wt_aa, mt_aa) = index_to_aggregate_report_aa_change(aa_change, variant_type)
-            if line['Best Transcript'] == transcript and line['AA Change'] == parsed_aa_change:
-                return (mt_records_dict[record_id], wt_records_dict[record_id], variant_type, mt_aa, wt_aa)
-        print("Unable to find full_peptide for variant {}".format(line['ID']))
-        return (None, None, variant_type, mt_aa, wt_aa)
+        else:
+            return 'full'
 
     def _get_peptide(self, line, mt_records_dict, wt_records_dict):
         ## Get epitope, peptide and full_peptide
@@ -287,28 +275,17 @@ def _get_peptide(self, line, mt_records_dict, wt_records_dict):
         else:
             if self._input_tsv_type(line) == 'aggregated':
                 epitope = line['Best Peptide']
-                (full_peptide, wt_peptide, variant_type, mt_amino_acids, wt_amino_acids) = self._get_full_peptide(line, mt_records_dict, wt_records_dict)
-                if full_peptide is None:
-                    return None, None
-                if variant_type != 'FS':
-                    if line['Pos'] == 'NA':
-                        mt_pos = None
-                        for i,(wt_aa,mt_aa) in enumerate(zip(wt_peptide,full_peptide)):
-                            if wt_aa != mt_aa:
-                                mt_pos = i
-                                break
-                        if mt_pos is None:
-                            return None, full_peptide
-                    else:
-                        mt_pos = int(line['Pos'].split('-')[0])
+                (rest_record_id, variant_type, aa_change) = line['Index'].rsplit(".", 2)
+                (_, mt_pos, wt_amino_acids, mt_amino_acids) = index_to_aggregate_report_aa_change(aa_change, variant_type)
+                mt_pos = int(line['Pos'].split('-')[0])
             else:
                 epitope = line['MT Epitope Seq']
-                full_peptide = mt_records_dict[line['Index']]
-                wt_peptide = wt_records_dict[line['Index']]
                 variant_type = line['Variant Type']
                 if variant_type != 'FS':
                     mt_pos = int(line['Mutation Position'].split('-')[0])
                     (wt_amino_acids, mt_amino_acids) = line['Mutation'].split('/')
+            full_peptide = mt_records_dict[line['Index']]
+            wt_peptide = wt_records_dict[line['Index']]
 
             # get peptide
             subpeptide_position = full_peptide.index(epitope)