Skip to content

Commit

Permalink
Merge pull request #31 from MRCIEU/mem
Browse files Browse the repository at this point in the history
Memory improvements
  • Loading branch information
Matthew Lyon authored Mar 11, 2020
2 parents 45d9ed8 + c01b201 commit c52e3c2
Show file tree
Hide file tree
Showing 12 changed files with 314 additions and 339 deletions.
39 changes: 13 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,45 +43,30 @@ Unit tests:

```
cd gwas2vcf
python -m unittest discover test
```

End-to-end tests:

```
bash test/example.sh
python -m pytest -v test
```

## Usage

```
Map GWAS summary statistics to VCF/BCF
usage: main.py [-h] [-v] [--out OUT] [--data DATA] --ref REF --json JSON
[--id ID] [--cohort_controls COHORT_CONTROLS]
[--cohort_cases COHORT_CASES] [--rm_chr_prefix] [--csi]
[--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
usage: main.py [-h] [-v] [--out OUT] [--data DATA] --ref REF --json JSON [--id ID] [--cohort_controls COHORT_CONTROLS]
[--cohort_cases COHORT_CASES] [--rm_chr_prefix] [--csi] [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
Map GWAS summary statistics to VCF/BCF
optional arguments:
-h, --help show this help message and exit
-v, --version show program's version number and exit
--out OUT Path to output VCF/BCF. If not present then must be
specified as 'out' in json file
--data DATA Path to GWAS summary stats. If not present then must
be specified as 'data' in json file
--out OUT Path to output VCF/BCF. If not present then must be specified as 'out' in json file
--data DATA Path to GWAS summary stats. If not present then must be specified as 'data' in json file
--ref REF Path to reference FASTA
--json JSON Path to parameters JSON
--id ID Study identifier. If not present then must be
specified as 'id' in json file
--id ID Study identifier. If not present then must be specified as 'id' in json file
--cohort_controls COHORT_CONTROLS
Total study number of controls (if case/control) or
total sample size if continuous. Overwrites value if
Total study number of controls (if case/control) or total sample size if continuous. Overwrites value if
present in json file.
--cohort_cases COHORT_CASES
Total study number of cases. Overwrites value if
present in json file.
Total study number of cases. Overwrites value if present in json file.
--rm_chr_prefix Remove chr prefix from GWAS chromosome
--csi Default is to index tbi but use this flag to index csi
--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}
Expand All @@ -90,9 +75,13 @@ optional arguments:

See `param.py` for JSON specification

## Example

See [gwas-vcf-performance](https://github.com/MRCIEU/gwas-vcf-performance/blob/master/workflow.Rmd) for a full implementation

## Combine multiallelics

Merge variants at a single genetic position onto a single row. This step is **highly** recommended to avoid duplicate RSIDs.

```
bcftools norm \
Expand Down Expand Up @@ -139,8 +128,6 @@ file.vcf.gz

## Merge multiple GWAS summary stats into a single file

Note: Merged GWAS BCFs are significantly slower to query; for best performance do not do this.

```
bcftools merge \
-O z \
Expand Down
123 changes: 90 additions & 33 deletions gwas_results.py → gwas.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging
import gzip
import os
import tempfile
import pickle
import re
from heapq import heappush

""" Class to store single GWAS result i.e. single SNP or row"""


class GwasResult:
class Gwas:

def __init__(self, chrom, pos, ref, alt, b, se, pval, n, alt_freq, dbsnpid, ncase, imp_info, imp_z,
vcf_filter="PASS"):
Expand Down Expand Up @@ -40,16 +42,20 @@ def reverse_sign(self):
except TypeError:
self.alt_freq = None

def check_reference_allele(self, fasta):
    """Assert that this record's REF allele matches the reference genome.

    Fetches the bases spanning positions [pos, pos + len(ref) - 1] on
    self.chrom from the supplied FASTA handle and compares them,
    case-insensitively, against self.ref.

    fasta -- object exposing fetch(region="chrom:start-end"), e.g.
             pysam.FastaFile. NOTE(review): assumes samtools-style
             1-based inclusive region coordinates — confirm against
             the pysam docs for the handle actually passed in.

    Raises AssertionError when the alleles do not match; callers use
    this to trigger allele harmonisation (strand/sign flipping).
    """
    expected = str(
        fasta.fetch(region="{}:{}-{}".format(
            self.chrom,
            self.pos,
            self.pos + len(self.ref) - 1
        ))
    ).upper()
    assert self.ref == expected

def check_alleles_iupac(self):
    """Assert every base of ALT and REF is an unambiguous nucleotide.

    Only the four concrete IUPAC bases A, T, G, C are accepted;
    ambiguity codes (N, R, Y, ...) raise AssertionError so the
    caller can skip the variant.
    """
    # Hoist the valid-base set out of the loops; checked once per base
    # of both alleles.
    valid_bases = {"A", "T", "G", "C"}
    for bp in self.alt:
        assert bp in valid_bases
    for bp in self.ref:
        assert bp in valid_bases

def __str__(self):
    """Debug-friendly representation: dump of all instance attributes."""
    return str(self.__dict__)
Expand All @@ -59,6 +65,7 @@ def __str__(self):
@staticmethod
def read_from_file(
path,
fasta,
chrom_field,
pos_field,
ea_field,
Expand All @@ -69,7 +76,7 @@ def read_from_file(
delimiter,
header,
ncase_field=None,
dbsnp_field=None,
rsid_field=None,
ea_af_field=None,
nea_af_field=None,
imp_z_field=None,
Expand All @@ -78,6 +85,8 @@ def read_from_file(
rm_chr_prefix=False
):

rsid_pattern = re.compile("^rs[0-9]*$")

logging.info("Reading summary stats and mapping to FASTA: {}".format(path))
logging.debug("File path: {}".format(path))
logging.debug("CHR field: {}".format(chrom_field))
Expand All @@ -90,14 +99,21 @@ def read_from_file(
logging.debug("Delimiter: {}".format(delimiter))
logging.debug("Header: {}".format(header))
logging.debug("ncase Field: {}".format(ncase_field))
logging.debug("dbsnp Field: {}".format(dbsnp_field))
logging.debug("dbsnp Field: {}".format(rsid_field))
logging.debug("EA AF Field: {}".format(ea_af_field))
logging.debug("NEA AF Field: {}".format(nea_af_field))
logging.debug("IMP Z Score Field: {}".format(imp_z_field))
logging.debug("IMP INFO Field: {}".format(imp_info_field))
logging.debug("N Control Field: {}".format(ncontrol_field))

total_variants = 0
metadata = {
'TotalVariants': 0,
'VariantsNotRead': 0,
'HarmonisedVariants': 0,
'VariantsNotHarmonised': 0,
'SwitchedAlleles': 0
}
file_idx = dict()
filename, file_extension = os.path.splitext(path)

if file_extension == '.gz':
Expand All @@ -111,11 +127,11 @@ def read_from_file(
if header:
logging.info("Skipping header: {}".format(f.readline().strip()))

results = []
i = 0
for n, l in enumerate(f):
total_variants += 1
i+=1
# store results in a serialised temp file to reduce memory usage
results = tempfile.TemporaryFile()

for l in f:
metadata['TotalVariants'] += 1
s = l.strip().split(delimiter)

logging.debug("Input row: {}".format(s))
Expand All @@ -125,15 +141,17 @@ def read_from_file(
chrom = s[chrom_field].replace("chr", "")
else:
chrom = s[chrom_field]
except:
logging.warning("Skipping {}".format(s))
exit
except Exception as e:
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

try:
pos = int(float(s[pos_field])) # float is for scientific notation
assert pos > 0
except Exception as e:
logging.warning("Skipping {}: {}".format(s, e))
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

ref = s[nea_field].upper()
Expand All @@ -142,19 +160,22 @@ def read_from_file(
try:
b = float(s[effect_field])
except Exception as e:
logging.warning("Skipping {}: {}".format(s, e))
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

try:
se = float(s[se_field])
except Exception as e:
logging.warning("Skipping {}: {}".format(s, e))
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

try:
pval = float(s[pval_field])
except Exception as e:
logging.warning("Skipping line {}, {}".format(s, e))
logging.debug("Skipping line {}, {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

try:
Expand All @@ -169,10 +190,11 @@ def read_from_file(
alt_freq = None

try:
dbsnpid = s[dbsnp_field]
except (IndexError, TypeError, ValueError) as e:
rsid = s[rsid_field]
assert rsid_pattern.match(rsid)
except (IndexError, TypeError, ValueError, AssertionError) as e:
logging.debug("Could not parse dbsnp identifier: {}".format(e))
dbsnpid = None
rsid = None

try:
ncase = float(s[ncase_field])
Expand Down Expand Up @@ -204,7 +226,7 @@ def read_from_file(
logging.debug("Could not parse imputation Z score: {}".format(e))
imp_z = None

result = GwasResult(
result = Gwas(
chrom,
pos,
ref,
Expand All @@ -214,16 +236,51 @@ def read_from_file(
pval,
n,
alt_freq,
dbsnpid,
rsid,
ncase,
imp_info,
imp_z
)

logging.debug("Extracted row: {}".format(result))

results.append(result)
# check alleles
try:
result.check_alleles_iupac()
except AssertionError as e:
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

# harmonise alleles
try:
result.check_reference_allele(fasta)
except AssertionError:
try:
result.reverse_sign()
result.check_reference_allele(fasta)
metadata['SwitchedAlleles'] += 1
except AssertionError as e:
logging.debug("Could not harmonise {}: {}".format(s, e))
metadata['VariantsNotHarmonised'] += 1
continue
metadata['HarmonisedVariants'] += 1

# keep file position sorted by chromosome position for recall later
if result.chrom not in file_idx:
file_idx[result.chrom] = []
heappush(file_idx[result.chrom], (result.pos, results.tell()))
pickle.dump(result, results)

f.close()

return results, total_variants
logging.info("Total variants: {}".format(metadata['TotalVariants']))
logging.info("Variants could not be read: {}".format(metadata['VariantsNotRead']))
logging.info("Variants harmonised: {}".format(metadata['HarmonisedVariants']))
logging.info("Variants discarded during harmonisation: {}".format(metadata['VariantsNotHarmonised']))
logging.info("Alleles switched: {}".format(metadata['SwitchedAlleles']))
logging.info("Skipped {} of {}".format(
metadata['VariantsNotRead'] + metadata['VariantsNotHarmonised'], metadata['TotalVariants'])
)

return results, file_idx, metadata
55 changes: 0 additions & 55 deletions harmonise.py

This file was deleted.

Loading

0 comments on commit c52e3c2

Please sign in to comment.