-
Notifications
You must be signed in to change notification settings - Fork 10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update consequence severity filtering #326
Changes from 7 commits
7eb3e33
621c975
77daa04
b9a873a
a22aa54
6ce5c46
0763189
9475ae3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,22 +15,13 @@ | |
from retry import retry | ||
|
||
parser = argparse.ArgumentParser(description=__doc__) | ||
parser.add_argument( | ||
'--enable-distant-querying', action='store_true', | ||
help='Enables a second iteration of querying VEP for distant gene variants, which is disabled by default' | ||
) | ||
parser.add_argument( | ||
'--report-distance', action='store_true', | ||
help='Report a distance to the gene for upstream/downstream gene variants. Disabled by default' | ||
) | ||
|
||
logging.basicConfig() | ||
logger = logging.getLogger('consequence_mapping') | ||
logger.setLevel(logging.INFO) | ||
|
||
# The "distance to the nearest gene" parameters, used to query VEP in first and second iterations, respectively. | ||
# The "distance to the nearest gene" parameters, used to query VEP. | ||
VEP_SHORT_QUERY_DISTANCE = 5000 | ||
VEP_LONG_QUERY_DISTANCE = 500000 | ||
|
||
|
||
def deduplicate_list(lst): | ||
|
@@ -54,7 +45,7 @@ def vep_id_to_colon_id(vep_id): | |
|
||
|
||
@retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3), logger=logger) | ||
def query_vep(variants, search_distance): | ||
def query_vep(variants, search_distance=VEP_SHORT_QUERY_DISTANCE): | ||
"""Query VEP and return results in JSON format. Upstream/downstream genes are searched up to a given distance.""" | ||
ensembl_request_url = 'https://rest.ensembl.org/vep/human/region' | ||
headers = {'Content-Type': 'application/json', 'Accept': 'application/json'} | ||
|
@@ -97,54 +88,58 @@ def load_consequence_severity_rank(): | |
return {term: index for index, term in enumerate(get_severity_ranking())} | ||
|
||
|
||
def extract_consequences(vep_results, acceptable_biotypes, only_closest, results_by_variant, report_distance=False): | ||
def extract_consequences(vep_results, acceptable_biotypes): | ||
"""Given VEP results, return a list of consequences matching certain criteria. | ||
|
||
Args: | ||
vep_results: results obtained from VEP for a list of variants, in JSON format. | ||
acceptable_biotypes: a list of transcript biotypes to consider (as defined in Ensembl documentation, see | ||
https://www.ensembl.org/info/genome/genebuild/biotypes.html). Consequences for other transcript biotypes | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why, among all biotypes, do we only consider miRNAs and protein-coding genes? Sorry again, this doesn't have much to do with this PR, but I'm curious. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a good question and it seems to have been decided long ago (before the birth of this repo). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Created #328 for this; we can discuss it at our next meeting. |
||
will be ignored. | ||
only_closest: if this flag is specified, then at most one consequence per variant will be returned. The | ||
consequences are sorted by distance from the gene and the closest one is chosen. In case of a tie, | ||
consequence is selected at random. If this flag is not specified, all consequences for each variant will | ||
be returned. | ||
results_by_variant: a dict to write the results into. | ||
report_distance: if set to True, a distance from the variant to the gene it affects will be reported | ||
(applicable to upstream and downstream gene variants). Otherwise, the distance parameter will always be 0. | ||
""" | ||
consequence_term_severity_rank = load_consequence_severity_rank() | ||
results_by_variant = defaultdict(list) | ||
for result in vep_results: | ||
variant_identifier = result['input'] | ||
results_by_variant.setdefault(variant_identifier, []) | ||
consequences = result.get('transcript_consequences', []) | ||
|
||
# Keep only consequences with the allowed biotypes; if there are none, skip this variant | ||
consequences = [c for c in consequences if c['biotype'] in acceptable_biotypes] | ||
if not consequences: | ||
continue | ||
|
||
# Flatten the list of consequence terms and find the most severe one | ||
all_consequence_terms = [term for c in consequences for term in c['consequence_terms']] | ||
all_consequence_terms.sort(key=lambda term: consequence_term_severity_rank[term]) | ||
most_severe_consequence_term = all_consequence_terms[0] | ||
|
||
# Keep only consequences which include the most severe consequence term; sort by increasing order of distance. | ||
# If there is no 'distance' attribute in VEP results, it means that it is not applicable as the variant resides | ||
# *inside* the gene; hence, in this case the distance is set to 0. | ||
consequences = [c for c in consequences if most_severe_consequence_term in c['consequence_terms']] | ||
consequences.sort(key=lambda consequence: abs(consequence.get('distance', 0))) | ||
|
||
# If mandated by a flag, keep only one (least distant) consequence | ||
if only_closest: | ||
consequences = [consequences[0]] | ||
|
||
# Return a subset of fields (required for output) of filtered consequences | ||
results_by_variant[variant_identifier].extend([ | ||
(variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term, | ||
c.get('distance', 0) if report_distance else 0) | ||
for c in consequences | ||
]) | ||
# If there is no 'distance' attribute in VEP results, it means the variant overlaps the gene. | ||
overlapping_consequences = [c for c in consequences if 'distance' not in c] | ||
|
||
# For genes overlapping the variant, we report the most severe consequence per gene. | ||
if overlapping_consequences: | ||
consequences_per_gene = defaultdict(list) | ||
for c in overlapping_consequences: | ||
key = (c['gene_id'], c.get('gene_symbol', '')) | ||
consequences_per_gene[key].extend(term for term in c['consequence_terms']) | ||
for (gene_id, gene_symbol), terms in consequences_per_gene.items(): | ||
most_severe_consequence_term = min(terms, key=lambda term: consequence_term_severity_rank[term]) | ||
results_by_variant[variant_identifier].append( | ||
(variant_identifier, gene_id, gene_symbol, most_severe_consequence_term) | ||
) | ||
apriltuesday marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# If there are no consequences on overlapping genes, we take the overall most severe consequence and all genes | ||
# associated with that consequence | ||
else: | ||
# Flatten the list of consequence terms and find the most severe one | ||
all_consequence_terms = [term for c in consequences for term in c['consequence_terms']] | ||
all_consequence_terms.sort(key=lambda term: consequence_term_severity_rank[term]) | ||
most_severe_consequence_term = all_consequence_terms[0] | ||
|
||
# Keep only consequences which include the most severe consequence term. | ||
consequences = [c for c in consequences if most_severe_consequence_term in c['consequence_terms']] | ||
consequences.sort(key=lambda consequence: abs(consequence.get('distance', 0))) | ||
|
||
# Return a subset of fields (required for output) of filtered consequences | ||
results_by_variant[variant_identifier].extend([ | ||
(variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term) | ||
for c in consequences | ||
]) | ||
|
||
return results_by_variant | ||
|
||
|
@@ -158,50 +153,21 @@ def get_variants_without_consequences(results_by_variant): | |
}) | ||
|
||
|
||
def process_variants(variants, enable_distant_querying=False, report_distance=False): | ||
def process_variants(variants): | ||
"""Given a list of variant IDs, return a list of consequence types (each including Ensembl gene name & ID and a | ||
functional consequence code) for a given variant. | ||
|
||
Args: | ||
enable_distant_querying: If set to True, an additional VEP query will be performed for variants for which no | ||
consequences were found during the first iteration, in an attempt to find distant variant consequences. | ||
report_distance: Whether to report distance to the nearest gene for upstream and downstream gene variants. | ||
See extract_consequences() for details. | ||
""" | ||
|
||
# First, we query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts | ||
# Query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts | ||
# up to a standard distance (5000 nucleotides either way, which is default for VEP) from the variant. | ||
results_by_variant = {} | ||
vep_results = query_vep(variants=variants, search_distance=VEP_SHORT_QUERY_DISTANCE) | ||
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}, | ||
only_closest=False, results_by_variant=results_by_variant, | ||
report_distance=report_distance) | ||
vep_results = query_vep(variants=variants) | ||
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}) | ||
|
||
# See if there are variants with no consequences up to the default distance | ||
variants_without_consequences = get_variants_without_consequences(results_by_variant) | ||
if variants_without_consequences: | ||
logger.info('Found {} variant(s) without standard consequences: {}'.format( | ||
len(variants_without_consequences), '|'.join(variants_without_consequences))) | ||
|
||
if enable_distant_querying: | ||
logger.info('Attempting to find distant consequences for the remaining variants') | ||
|
||
# If there are, we will now do a second round of querying, this time looking only at protein coding biotypes | ||
# (vs. miRNA *and* protein coding during the first round) up to a distance of 500,000 bases each way. | ||
if variants_without_consequences: | ||
distant_vep_results = query_vep(variants=variants_without_consequences, | ||
search_distance=VEP_LONG_QUERY_DISTANCE) | ||
extract_consequences(vep_results=distant_vep_results, acceptable_biotypes={'protein_coding'}, | ||
only_closest=True, results_by_variant=results_by_variant, | ||
report_distance=report_distance) | ||
|
||
# See if there are still variants with no consequences, even up to a wide search window | ||
variants_without_consequences = get_variants_without_consequences(results_by_variant) | ||
if variants_without_consequences: | ||
logger.info('After distant querying, still remaining {} variant(s) without consequences: {}'.format( | ||
len(variants_without_consequences), '|'.join(variants_without_consequences) | ||
)) | ||
|
||
# Yield all consequences for all variants. Note they are not grouped by variant, all consequences are yielded in a | ||
# common sequence. | ||
for variant_id, variant_consequences in results_by_variant.items(): | ||
|
@@ -217,12 +183,9 @@ def main(): | |
variants_to_query = [colon_based_id_to_vep_id(v) for v in sys.stdin.read().splitlines()] | ||
|
||
# Query VEP with all variants at once (for the purpose of efficiency), print out the consequences to STDOUT. | ||
consequences = process_variants(variants_to_query, | ||
enable_distant_querying=args.enable_distant_querying, | ||
report_distance=args.report_distance) | ||
for variant_id, gene_id, gene_symbol, consequence_term, distance in consequences: | ||
# The second column, set statically to 1, is not used, and is maintained for compatibility purposes | ||
print('\t'.join([vep_id_to_colon_id(variant_id), '1', gene_id, gene_symbol, consequence_term, str(distance)])) | ||
consequences = process_variants(variants_to_query) | ||
for variant_id, gene_id, gene_symbol, consequence_term in consequences: | ||
print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term])) | ||
|
||
logger.info('Successfully processed {} variants'.format(len(variants_to_query))) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What was the reason behind no longer querying VEP a second time when the short-distance query doesn't find any gene? Just curious.