Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update consequence severity filtering #326

Merged
merged 8 commits into from
May 13, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions consequence_prediction/repeat_expansion_variants/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,9 @@ def extract_consequences(variants):
# Get rid of sets
consequences['RepeatType'] = consequences['RepeatType'].apply(list)
consequences = consequences.explode('RepeatType')
# Form a six-column file compatible with the consequence mapping pipeline, for example:
# RCV000005966 1 ENSG00000156475 PPP2R2B trinucleotide_repeat_expansion 0
consequences['PlaceholderOnes'] = 1
consequences['PlaceholderZeroes'] = 0
consequences = consequences[['RCVaccession', 'PlaceholderOnes', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType',
'PlaceholderZeroes']]
# Form a four-column file compatible with the consequence mapping pipeline, for example:
# RCV000005966 ENSG00000156475 PPP2R2B trinucleotide_repeat_expansion
consequences = consequences[['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType']]
consequences.sort_values(by=['RepeatType', 'RCVaccession', 'EnsemblGeneID'], inplace=True)
# Check that there are no empty cells in the final consequences table
assert consequences.isnull().to_numpy().sum() == 0
Expand Down
20 changes: 8 additions & 12 deletions consequence_prediction/structural_variants/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

import pandas as pd

from consequence_prediction.vep_mapping_pipeline.consequence_mapping import query_vep, VEP_SHORT_QUERY_DISTANCE, \
extract_consequences, deduplicate_list
from consequence_prediction.vep_mapping_pipeline.consequence_mapping import query_vep, extract_consequences, \
deduplicate_list
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io import ClinVarDataset
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.hgvs_variant import HgvsVariant, VariantType, SequenceType

Expand Down Expand Up @@ -61,7 +61,7 @@ def get_vep_results(clinvar_xml):
i = 0
vep_results = []
for group in grouper(variants, n=200):
vep_results.extend(query_vep(variants=group, search_distance=VEP_SHORT_QUERY_DISTANCE))
vep_results.extend(query_vep(variants=group))
i += 1
logger.info(f'Done with batch {i}')

Expand All @@ -70,20 +70,16 @@ def get_vep_results(clinvar_xml):

def main(clinvar_xml):
vep_results = get_vep_results(clinvar_xml)
results_by_variant = {}
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'},
only_closest=False, results_by_variant=results_by_variant,
report_distance=False)
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'})
variant_data = []
for variant_id, variant_consequences in results_by_variant.items():
for consequence_to_yield in deduplicate_list(variant_consequences):
variant_id, gene_id, gene_symbol, consequence_term, distance = consequence_to_yield
variant_id, gene_id, gene_symbol, consequence_term = consequence_to_yield
# variant_id here is the entire input to VEP, we only want the HGVS that we passed as an identifier
hgvs_id = variant_id.split()[-1]
# The second column, set statically to 1, is not used, and is maintained for compatibility purposes
variant_data.append((hgvs_id, '1', gene_id, gene_symbol, consequence_term, distance))
variant_data.append((hgvs_id, gene_id, gene_symbol, consequence_term))

# Return as a dataframe to be compatible with repeat expansion pipeline
consequences = pd.DataFrame(variant_data, columns=('VariantID', 'PlaceholderOnes', 'EnsemblGeneID',
'EnsemblGeneName', 'ConsequenceTerm', 'Distance'))
consequences = pd.DataFrame(variant_data, columns=('VariantID', 'EnsemblGeneID',
'EnsemblGeneName', 'ConsequenceTerm'))
return consequences
123 changes: 43 additions & 80 deletions consequence_prediction/vep_mapping_pipeline/consequence_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,13 @@
from retry import retry

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--enable-distant-querying', action='store_true',
help='Enables a second iteration of querying VEP for distant gene variants, which is disabled by default'
)
parser.add_argument(
'--report-distance', action='store_true',
help='Report a distance to the gene for upstream/downstream gene variants. Disabled by default'
)

logging.basicConfig()
logger = logging.getLogger('consequence_mapping')
logger.setLevel(logging.INFO)

# The "distance to the nearest gene" parameters, used to query VEP in first and second iterations, respectively.
# The "distance to the nearest gene" parameters, used to query VEP.
VEP_SHORT_QUERY_DISTANCE = 5000
VEP_LONG_QUERY_DISTANCE = 500000
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What was the reason behind no longer querying VEP twice if the short distance doesn't find any gene? Just curious



def deduplicate_list(lst):
Expand All @@ -54,7 +45,7 @@ def vep_id_to_colon_id(vep_id):


@retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3), logger=logger)
def query_vep(variants, search_distance):
def query_vep(variants, search_distance=VEP_SHORT_QUERY_DISTANCE):
"""Query VEP and return results in JSON format. Upstream/downstream genes are searched up to a given distance."""
ensembl_request_url = 'https://rest.ensembl.org/vep/human/region'
headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
Expand Down Expand Up @@ -97,54 +88,58 @@ def load_consequence_severity_rank():
return {term: index for index, term in enumerate(get_severity_ranking())}


def extract_consequences(vep_results, acceptable_biotypes, only_closest, results_by_variant, report_distance=False):
def extract_consequences(vep_results, acceptable_biotypes):
"""Given VEP results, return a list of consequences matching certain criteria.

Args:
vep_results: results obtained from VEP for a list of variants, in JSON format.
acceptable_biotypes: a list of transcript biotypes to consider (as defined in Ensembl documentation, see
https://www.ensembl.org/info/genome/genebuild/biotypes.html). Consequences for other transcript biotypes
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why, among all biotypes, we only consider miRNAs and protein coding genes? Sorry again, has not much to do with this PR but I'm curious.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a good question and it seems to have been decided long ago (before the birth of this repo).
There are a few others that we could consider in the non-coding genes, the IG genes and the TR genes.
Probably. a topic to raise with OpenTarget

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created #328 for this, we can discuss at our next meeting.

will be ignored.
only_closest: if this flag is specified, then at most one consequence per variant will be returned. The
consequences are sorted by distance from the gene and the closest one is chosen. In case of a tie,
consequence is selected at random. If this flag is not specified, all consequences for each variant will
be returned.
results_by_variant: a dict to write the results into.
report_distance: if set to True, a distance from the variant to the gene it affects will be reported
(applicable to upstream and downstream gene variants). Otherwise, the distance parameter will always be 0.
"""
consequence_term_severity_rank = load_consequence_severity_rank()
results_by_variant = defaultdict(list)
for result in vep_results:
variant_identifier = result['input']
results_by_variant.setdefault(variant_identifier, [])
consequences = result.get('transcript_consequences', [])

# Keep only consequences with the allowed biotypes; if there are none, skip this variant
consequences = [c for c in consequences if c['biotype'] in acceptable_biotypes]
if not consequences:
continue

# Flatten the list of consequence terms and find the most severe one
all_consequence_terms = [term for c in consequences for term in c['consequence_terms']]
all_consequence_terms.sort(key=lambda term: consequence_term_severity_rank[term])
most_severe_consequence_term = all_consequence_terms[0]

# Keep only consequences which include the most severe consequence term; sort by increasing order of distance.
# If there is no 'distance' attribute in VEP results, it means that it is not applicable as the variant resides
# *inside* the gene; hence, in this case the distance is set to 0.
consequences = [c for c in consequences if most_severe_consequence_term in c['consequence_terms']]
consequences.sort(key=lambda consequence: abs(consequence.get('distance', 0)))

# If mandated by a flag, keep only one (least distant) consequence
if only_closest:
consequences = [consequences[0]]

# Return a subset of fields (required for output) of filtered consequences
results_by_variant[variant_identifier].extend([
(variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term,
c.get('distance', 0) if report_distance else 0)
for c in consequences
])
# If there is no 'distance' attribute in VEP results, it means the variant overlaps the gene.
overlapping_consequences = [c for c in consequences if 'distance' not in c]

# For genes overlapping the variant, we report the most severe consequence per gene.
if overlapping_consequences:
consequences_per_gene = defaultdict(list)
for c in overlapping_consequences:
key = (c['gene_id'], c.get('gene_symbol', ''))
consequences_per_gene[key].extend(term for term in c['consequence_terms'])
for (gene_id, gene_symbol), terms in consequences_per_gene.items():
most_severe_consequence_term = min(terms, key=lambda term: consequence_term_severity_rank[term])
results_by_variant[variant_identifier].append(
(variant_identifier, gene_id, gene_symbol, most_severe_consequence_term)
)
apriltuesday marked this conversation as resolved.
Show resolved Hide resolved

# If there are no consequences on overlapping genes, we take the overall most severe consequence and all genes
# associated with that consequence
else:
# Flatten the list of consequence terms and find the most severe one
all_consequence_terms = [term for c in consequences for term in c['consequence_terms']]
all_consequence_terms.sort(key=lambda term: consequence_term_severity_rank[term])
most_severe_consequence_term = all_consequence_terms[0]

# Keep only consequences which include the most severe consequence term.
consequences = [c for c in consequences if most_severe_consequence_term in c['consequence_terms']]
consequences.sort(key=lambda consequence: abs(consequence.get('distance', 0)))

# Return a subset of fields (required for output) of filtered consequences
results_by_variant[variant_identifier].extend([
(variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term)
for c in consequences
])

return results_by_variant

Expand All @@ -158,50 +153,21 @@ def get_variants_without_consequences(results_by_variant):
})


def process_variants(variants, enable_distant_querying=False, report_distance=False):
def process_variants(variants):
"""Given a list of variant IDs, return a list of consequence types (each including Ensembl gene name & ID and a
functional consequence code) for a given variant.

Args:
enable_distant_querying: If set to True, an additional VEP query will be performed for variants for which no
consequences were found during the first iteration, in an attempt to find distant variant consequences.
report_distance: Whether to report distance to the nearest gene for upstream and downstream gene variants.
See extract_consequences() for details.
"""

# First, we query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts
# Query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts
# up to a standard distance (5000 nucleotides either way, which is default for VEP) from the variant.
results_by_variant = {}
vep_results = query_vep(variants=variants, search_distance=VEP_SHORT_QUERY_DISTANCE)
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'},
only_closest=False, results_by_variant=results_by_variant,
report_distance=report_distance)
vep_results = query_vep(variants=variants)
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'})

# See if there are variants with no consequences up to the default distance
variants_without_consequences = get_variants_without_consequences(results_by_variant)
if variants_without_consequences:
logger.info('Found {} variant(s) without standard consequences: {}'.format(
len(variants_without_consequences), '|'.join(variants_without_consequences)))

if enable_distant_querying:
logger.info('Attempting to find distant consequences for the remaining variants')

# If there are, we will now do a second round of querying, this time looking only at protein coding biotypes
# (vs. miRNA *and* protein coding during the first round) up to a distance of 500,000 bases each way.
if variants_without_consequences:
distant_vep_results = query_vep(variants=variants_without_consequences,
search_distance=VEP_LONG_QUERY_DISTANCE)
extract_consequences(vep_results=distant_vep_results, acceptable_biotypes={'protein_coding'},
only_closest=True, results_by_variant=results_by_variant,
report_distance=report_distance)

# See if there are still variants with no consequences, even up to a wide search window
variants_without_consequences = get_variants_without_consequences(results_by_variant)
if variants_without_consequences:
logger.info('After distant querying, still remaining {} variant(s) without consequences: {}'.format(
len(variants_without_consequences), '|'.join(variants_without_consequences)
))

# Yield all consequences for all variants. Note they are not grouped by variant, all consequences are yielded in a
# common sequence.
for variant_id, variant_consequences in results_by_variant.items():
Expand All @@ -217,12 +183,9 @@ def main():
variants_to_query = [colon_based_id_to_vep_id(v) for v in sys.stdin.read().splitlines()]

# Query VEP with all variants at once (for the purpose of efficiency), print out the consequences to STDOUT.
consequences = process_variants(variants_to_query,
enable_distant_querying=args.enable_distant_querying,
report_distance=args.report_distance)
for variant_id, gene_id, gene_symbol, consequence_term, distance in consequences:
# The second column, set statically to 1, is not used, and is maintained for compatibility purposes
print('\t'.join([vep_id_to_colon_id(variant_id), '1', gene_id, gene_symbol, consequence_term, str(distance)]))
consequences = process_variants(variants_to_query)
for variant_id, gene_id, gene_symbol, consequence_term in consequences:
print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term]))

logger.info('Successfully processed {} variants'.format(len(variants_to_query)))

Expand Down
10 changes: 5 additions & 5 deletions eva_cttv_pipeline/evidence_string_generation/consequence_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def process_consequence_type_dataframes(*dataframes):
for consequences_dataframe in dataframes:
for row in consequences_dataframe.itertuples():
variant_id = row[1]
ensembl_gene_id = row[3]
so_term = row[5]
ensembl_gene_id = row[2]
so_term = row[4]

process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term)

Expand All @@ -44,13 +44,13 @@ def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None):
line = line.rstrip()
line_list = line.split("\t")

if len(line_list) < 6:
if len(line_list) < 4:
logger.warning('Skip invalid line in snp_2_gene file: {}'.format(line))
continue

variant_id = line_list[0]
ensembl_gene_id = line_list[2]
so_term = line_list[4]
ensembl_gene_id = line_list[1]
so_term = line_list[3]

if ensembl_gene_id == 'NA':
logger.warning('Skip line with missing gene ID: {}'.format(line))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,46 +54,46 @@ def test_explicit_coordinates():
"""Repeat expansion events with complete coordinates must be processed with the correct type."""
assert sorted(run_pipeline('explicit_coords.xml.gz')) == sorted([
# CGC expansion, trinucleotide.
['RCV001051772', '1', 'ENSG00000130711', 'PRDM12', 'trinucleotide_repeat_expansion', '0'],
['RCV001051772', 'ENSG00000130711', 'PRDM12', 'trinucleotide_repeat_expansion'],
# CCGGGACCGAGA (12 base unit) expansion, also to be considered a trinucleotide expansion.
['RCV000722291', '1', 'ENSG00000142599', 'RERE', 'trinucleotide_repeat_expansion', '0'],
['RCV000722291', 'ENSG00000142599', 'RERE', 'trinucleotide_repeat_expansion'],
# CT expansion, non-trinucleotide.
['RCV000292700', '1', 'ENSG00000163554', 'SPTA1', 'short_tandem_repeat_expansion', '0'],
['RCV000292700', 'ENSG00000163554', 'SPTA1', 'short_tandem_repeat_expansion'],
# TCAT expansion, non-trinucleotide but could be mistaken for one (3 units of 4 = 12, divisible by 3).
['RCV000122358', '1', 'ENSG00000135100', 'HNF1A', 'short_tandem_repeat_expansion', '0'],
['RCV000122358', 'ENSG00000135100', 'HNF1A', 'short_tandem_repeat_expansion'],
])


def test_no_explicit_coordinates():
"""Repeat expansion events without complete coordinates must also be processed and parsed."""
assert run_pipeline('no_explicit_coords.xml.gz') == [
# NM_023067.3(FOXL2):c.661GCN[15_24] (p.Ala221[(15_24)]) should be parsed as a trinucleotide expansion
['RCV000192035', '1', 'ENSG00000183770', 'FOXL2', 'trinucleotide_repeat_expansion', '0'],
['RCV000192035', 'ENSG00000183770', 'FOXL2', 'trinucleotide_repeat_expansion'],
]


def test_alternative_identifiers():
"""Protein HGVS and human-readable identifiers should also be processed."""
assert sorted(run_pipeline('alternatives.xml.gz')) == sorted([
# NP_003915.2:p.Ala260(5_9) is protein hgvs and assumed to be trinucleotide expansion
['RCV000006377', '1', 'ENSG00000109132', 'PHOX2B', 'trinucleotide_repeat_expansion', '0'],
['RCV000006377', 'ENSG00000109132', 'PHOX2B', 'trinucleotide_repeat_expansion'],
# ATXN2, (CAG)n REPEAT EXPANSION is inferred to be trinucleotide from the repeat unit length
['RCV000008583', '1', 'ENSG00000204842', 'ATXN2', 'trinucleotide_repeat_expansion', '0']
['RCV000008583', 'ENSG00000204842', 'ATXN2', 'trinucleotide_repeat_expansion']
])


def test_missing_names():
"""Records that are missing variant names are still processed using HGVS identifier."""
results = run_pipeline('missing_names.xml.gz')
assert len(results) == 7
assert all(r[4] == 'short_tandem_repeat_expansion' for r in results)
assert all(r[3] == 'short_tandem_repeat_expansion' for r in results)


def test_missing_names_and_hgvs():
"""Records that are missing variant names and HGVS should use coordinate span from alleles instead."""
assert sorted(run_pipeline('missing_names_and_hgvs.xml.gz')) == [
# ref=T, alt=TGAAAGAAAGAAAGAAAGAAA => correctly classified as short tandem repeat
['RCV001355211', '1', 'ENSG00000109861', 'CTSC', 'short_tandem_repeat_expansion', '0'],
['RCV001355211', 'ENSG00000109861', 'CTSC', 'short_tandem_repeat_expansion'],
# ref=T, alt=TACACACACACAC => classified as trinucleotide repeat without repeating unit inference.
['RCV001356600', '1', 'ENSG00000136869', 'TLR4', 'trinucleotide_repeat_expansion', '0']
['RCV001356600', 'ENSG00000136869', 'TLR4', 'trinucleotide_repeat_expansion']
]
Loading