Skip to content

Commit

Permalink
Merge pull request #31 from MRCIEU/mem
Browse files Browse the repository at this point in the history
Memory improvements
  • Loading branch information
Matthew Lyon authored Mar 11, 2020
2 parents 45d9ed8 + c01b201 commit c52e3c2
Show file tree
Hide file tree
Showing 12 changed files with 314 additions and 339 deletions.
39 changes: 13 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,45 +43,30 @@ Unit tests:

```
cd gwas2vcf
python -m unittest discover test
```

End-to-end tests:

```
bash test/example.sh
python -m pytest -v test
```

## Usage

```
Map GWAS summary statistics to VCF/BCF
usage: main.py [-h] [-v] [--out OUT] [--data DATA] --ref REF --json JSON
[--id ID] [--cohort_controls COHORT_CONTROLS]
[--cohort_cases COHORT_CASES] [--rm_chr_prefix] [--csi]
[--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
usage: main.py [-h] [-v] [--out OUT] [--data DATA] --ref REF --json JSON [--id ID] [--cohort_controls COHORT_CONTROLS]
[--cohort_cases COHORT_CASES] [--rm_chr_prefix] [--csi] [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
Map GWAS summary statistics to VCF/BCF
optional arguments:
-h, --help show this help message and exit
-v, --version show program's version number and exit
--out OUT Path to output VCF/BCF. If not present then must be
specified as 'out' in json file
--data DATA Path to GWAS summary stats. If not present then must
be specified as 'data' in json file
--out OUT Path to output VCF/BCF. If not present then must be specified as 'out' in json file
--data DATA Path to GWAS summary stats. If not present then must be specified as 'data' in json file
--ref REF Path to reference FASTA
--json JSON Path to parameters JSON
--id ID Study identifier. If not present then must be
specified as 'id' in json file
--id ID Study identifier. If not present then must be specified as 'id' in json file
--cohort_controls COHORT_CONTROLS
Total study number of controls (if case/control) or
total sample size if continuous. Overwrites value if
Total study number of controls (if case/control) or total sample size if continuous. Overwrites value if
present in json file.
--cohort_cases COHORT_CASES
Total study number of cases. Overwrites value if
present in json file.
Total study number of cases. Overwrites value if present in json file.
--rm_chr_prefix Remove chr prefix from GWAS chromosome
--csi Default is to index tbi but use this flag to index csi
--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}
Expand All @@ -90,9 +75,13 @@ optional arguments:

See `param.py` for JSON specification

## Example

See [gwas-vcf-performance](https://github.com/MRCIEU/gwas-vcf-performance/blob/master/workflow.Rmd) for a full implementation

## Combine multiallelics

Merge variants at a single genetic position onto a single row. This step is **highly** recommended to avoid duplicate RSIDs.

```
bcftools norm \
Expand Down Expand Up @@ -139,8 +128,6 @@ file.vcf.gz

## Merge multiple GWAS summary stats into a single file

Note: Merged GWAS BCFs are significantly slower to query; for best performance do not do this.

```
bcftools merge \
-O z \
Expand Down
123 changes: 90 additions & 33 deletions gwas_results.py → gwas.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging
import gzip
import os
import tempfile
import pickle
import re
from heapq import heappush

""" Class to store single GWAS result i.e. single SNP or row"""


class GwasResult:
class Gwas:

def __init__(self, chrom, pos, ref, alt, b, se, pval, n, alt_freq, dbsnpid, ncase, imp_info, imp_z,
vcf_filter="PASS"):
Expand Down Expand Up @@ -40,16 +42,20 @@ def reverse_sign(self):
except TypeError:
self.alt_freq = None

def check_reference_allele(self, fasta):
    """Assert that this record's REF allele matches the reference genome.

    Fetches the bases spanning positions [pos, pos + len(ref) - 1] on
    self.chrom from the supplied FASTA handle and compares them,
    case-insensitively, against self.ref.

    fasta -- object exposing fetch(region="chrom:start-end"), e.g.
             pysam.FastaFile. NOTE(review): assumes samtools-style
             1-based inclusive region coordinates — confirm against
             the pysam docs for the handle actually passed in.

    Raises AssertionError when the alleles do not match; callers use
    this to trigger allele harmonisation (strand/sign flipping).
    """
    expected = str(
        fasta.fetch(region="{}:{}-{}".format(
            self.chrom,
            self.pos,
            self.pos + len(self.ref) - 1
        ))
    ).upper()
    assert self.ref == expected

def check_alleles_iupac(self):
    """Assert every base of ALT and REF is an unambiguous nucleotide.

    Only the four concrete IUPAC bases A, T, G, C are accepted;
    ambiguity codes (N, R, Y, ...) raise AssertionError so the
    caller can skip the variant.
    """
    # Hoist the valid-base set out of the loops; checked once per base
    # of both alleles.
    valid_bases = {"A", "T", "G", "C"}
    for bp in self.alt:
        assert bp in valid_bases
    for bp in self.ref:
        assert bp in valid_bases

def __str__(self):
    """Debug-friendly representation: dump of all instance attributes."""
    return str(self.__dict__)
Expand All @@ -59,6 +65,7 @@ def __str__(self):
@staticmethod
def read_from_file(
path,
fasta,
chrom_field,
pos_field,
ea_field,
Expand All @@ -69,7 +76,7 @@ def read_from_file(
delimiter,
header,
ncase_field=None,
dbsnp_field=None,
rsid_field=None,
ea_af_field=None,
nea_af_field=None,
imp_z_field=None,
Expand All @@ -78,6 +85,8 @@ def read_from_file(
rm_chr_prefix=False
):

rsid_pattern = re.compile("^rs[0-9]*$")

logging.info("Reading summary stats and mapping to FASTA: {}".format(path))
logging.debug("File path: {}".format(path))
logging.debug("CHR field: {}".format(chrom_field))
Expand All @@ -90,14 +99,21 @@ def read_from_file(
logging.debug("Delimiter: {}".format(delimiter))
logging.debug("Header: {}".format(header))
logging.debug("ncase Field: {}".format(ncase_field))
logging.debug("dbsnp Field: {}".format(dbsnp_field))
logging.debug("dbsnp Field: {}".format(rsid_field))
logging.debug("EA AF Field: {}".format(ea_af_field))
logging.debug("NEA AF Field: {}".format(nea_af_field))
logging.debug("IMP Z Score Field: {}".format(imp_z_field))
logging.debug("IMP INFO Field: {}".format(imp_info_field))
logging.debug("N Control Field: {}".format(ncontrol_field))

total_variants = 0
metadata = {
'TotalVariants': 0,
'VariantsNotRead': 0,
'HarmonisedVariants': 0,
'VariantsNotHarmonised': 0,
'SwitchedAlleles': 0
}
file_idx = dict()
filename, file_extension = os.path.splitext(path)

if file_extension == '.gz':
Expand All @@ -111,11 +127,11 @@ def read_from_file(
if header:
logging.info("Skipping header: {}".format(f.readline().strip()))

results = []
i = 0
for n, l in enumerate(f):
total_variants += 1
i+=1
# store results in a serialised temp file to reduce memory usage
results = tempfile.TemporaryFile()

for l in f:
metadata['TotalVariants'] += 1
s = l.strip().split(delimiter)

logging.debug("Input row: {}".format(s))
Expand All @@ -125,15 +141,17 @@ def read_from_file(
chrom = s[chrom_field].replace("chr", "")
else:
chrom = s[chrom_field]
except:
logging.warning("Skipping {}".format(s))
exit
except Exception as e:
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

try:
pos = int(float(s[pos_field])) # float is for scientific notation
assert pos > 0
except Exception as e:
logging.warning("Skipping {}: {}".format(s, e))
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

ref = s[nea_field].upper()
Expand All @@ -142,19 +160,22 @@ def read_from_file(
try:
b = float(s[effect_field])
except Exception as e:
logging.warning("Skipping {}: {}".format(s, e))
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

try:
se = float(s[se_field])
except Exception as e:
logging.warning("Skipping {}: {}".format(s, e))
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

try:
pval = float(s[pval_field])
except Exception as e:
logging.warning("Skipping line {}, {}".format(s, e))
logging.debug("Skipping line {}, {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

try:
Expand All @@ -169,10 +190,11 @@ def read_from_file(
alt_freq = None

try:
dbsnpid = s[dbsnp_field]
except (IndexError, TypeError, ValueError) as e:
rsid = s[rsid_field]
assert rsid_pattern.match(rsid)
except (IndexError, TypeError, ValueError, AssertionError) as e:
logging.debug("Could not parse dbsnp identifier: {}".format(e))
dbsnpid = None
rsid = None

try:
ncase = float(s[ncase_field])
Expand Down Expand Up @@ -204,7 +226,7 @@ def read_from_file(
logging.debug("Could not parse imputation Z score: {}".format(e))
imp_z = None

result = GwasResult(
result = Gwas(
chrom,
pos,
ref,
Expand All @@ -214,16 +236,51 @@ def read_from_file(
pval,
n,
alt_freq,
dbsnpid,
rsid,
ncase,
imp_info,
imp_z
)

logging.debug("Extracted row: {}".format(result))

results.append(result)
# check alleles
try:
result.check_alleles_iupac()
except AssertionError as e:
logging.debug("Skipping {}: {}".format(s, e))
metadata['VariantsNotRead'] += 1
continue

# harmonise alleles
try:
result.check_reference_allele(fasta)
except AssertionError:
try:
result.reverse_sign()
result.check_reference_allele(fasta)
metadata['SwitchedAlleles'] += 1
except AssertionError as e:
logging.debug("Could not harmonise {}: {}".format(s, e))
metadata['VariantsNotHarmonised'] += 1
continue
metadata['HarmonisedVariants'] += 1

# keep file position sorted by chromosome position for recall later
if result.chrom not in file_idx:
file_idx[result.chrom] = []
heappush(file_idx[result.chrom], (result.pos, results.tell()))
pickle.dump(result, results)

f.close()

return results, total_variants
logging.info("Total variants: {}".format(metadata['TotalVariants']))
logging.info("Variants could not be read: {}".format(metadata['VariantsNotRead']))
logging.info("Variants harmonised: {}".format(metadata['HarmonisedVariants']))
logging.info("Variants discarded during harmonisation: {}".format(metadata['VariantsNotHarmonised']))
logging.info("Alleles switched: {}".format(metadata['SwitchedAlleles']))
logging.info("Skipped {} of {}".format(
metadata['VariantsNotRead'] + metadata['VariantsNotHarmonised'], metadata['TotalVariants'])
)

return results, file_idx, metadata
55 changes: 0 additions & 55 deletions harmonise.py

This file was deleted.

Loading

0 comments on commit c52e3c2

Please sign in to comment.