Skip to content

Commit

Permalink
code rearrangement
Browse files Browse the repository at this point in the history
  • Loading branch information
omerwe committed Mar 10, 2020
1 parent 196c1fb commit 02b2a7b
Show file tree
Hide file tree
Showing 9 changed files with 51 additions and 231 deletions.
28 changes: 2 additions & 26 deletions compute_ldscores.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import time
from ldsc_polyfun import ldscore, parse
import logging
from tqdm import tqdm
from pandas.api.types import is_numeric_dtype
from polyfun_utils import configure_logger



def __filter__(fname, noun, verb, merge_obj):
Expand All @@ -28,31 +29,6 @@ def __filter__(fname, noun, verb, merge_obj):
return merged_list



class TqdmHandler(logging.StreamHandler):
    """Logging handler that prints records via tqdm.write so that log
    messages do not corrupt an active tqdm progress bar."""

    def __init__(self):
        super().__init__()

    def emit(self, record):
        # Format the record with the configured formatter, then let tqdm
        # print it and redraw any active progress bar afterwards.
        tqdm.write(self.format(record))


def configure_logger(out_prefix):
    """Configure root logging: console output through a TqdmHandler plus a
    file handler writing to <out_prefix>.log, both using the
    '[LEVEL] message' format."""
    formatter = logging.Formatter("[%(levelname)s] %(message)s")
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.NOTSET)

    # Console handler first (tqdm-aware), then the log file handler.
    for handler in (TqdmHandler(), logging.FileHandler(out_prefix + '.log')):
        handler.setFormatter(formatter)
        root_logger.addHandler(handler)



def compute_ldscores(args):

Expand Down
37 changes: 1 addition & 36 deletions compute_ldscores_ukb.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import tempfile
import scipy.sparse as sparse
from pandas.api.types import is_numeric_dtype
from polyfun_utils import configure_logger, set_snpid_index


UKBB_LD_URL = 'https://data.broadinstitute.org/alkesgroup/UKBB_LD'
Expand All @@ -18,14 +19,7 @@
META_COLUMNS = ['SNP', 'CHR', 'BP', 'A1', 'A2']


class TqdmHandler(logging.StreamHandler):
    """StreamHandler variant whose output goes through tqdm.write,
    keeping log lines from breaking tqdm progress-bar rendering."""

    def __init__(self):
        logging.StreamHandler.__init__(self)

    def emit(self, record):
        # Delegate formatting to the base-class machinery, then hand the
        # message to tqdm for bar-safe printing.
        formatted = self.format(record)
        tqdm.write(formatted)


class TqdmUpTo(tqdm):
"""
Expand All @@ -36,35 +30,6 @@ def update_to(self, b=1, bsize=1, tsize=None):
if tsize is not None: self.total = tsize
self.update(b * bsize - self.n)


def configure_logger(out_prefix):
    """Set up the root logger with two destinations: the console (via a
    tqdm-aware handler) and a log file named <out_prefix>.log."""
    log_format = logging.Formatter("[%(levelname)s] %(message)s")
    root = logging.getLogger()
    root.setLevel(logging.NOTSET)

    console_handler = TqdmHandler()
    console_handler.setFormatter(log_format)
    root.addHandler(console_handler)

    file_handler = logging.FileHandler('%s.log' % (out_prefix))
    file_handler.setFormatter(log_format)
    root.addHandler(file_handler)


def set_snpid_index(df, copy=False):
    """Index df by a canonical 'snpid' of the form CHR.BP.allele1.allele2.

    The allele pair is ordered deterministically so the same variant gets
    the same id regardless of which allele was listed as A1: A1 is kept
    first when A1 < A2 lexicographically or when either allele is
    multi-character (an indel); otherwise the two alleles are swapped.

    Mutates df in place unless copy=True; returns the indexed frame.
    """
    if copy:
        df = df.copy()
    # True where the (A1, A2) order is already canonical.
    keep_order = (df['A1'] < df['A2']) | (df['A1'].str.len() > 1) | (df['A2'].str.len() > 1)
    first_allele = df['A1'].where(keep_order, df['A2'])
    second_allele = df['A2'].where(keep_order, df['A1'])
    df.index = df['CHR'].astype(str) + '.' + df['BP'].astype(str) + '.' + first_allele + '.' + second_allele
    df.index.name = 'snpid'
    return df


def read_annot(annot_file):
try:
Expand Down
Binary file modified example_data/chr1.finemap_sumstats.txt.gz
Binary file not shown.
7 changes: 1 addition & 6 deletions extract_snpvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,9 @@
import numpy as np
import os
import logging
from polyfun import configure_logger
from pyarrow import ArrowIOError
from polyfun_utils import check_package_versions, configure_logger

def check_package_versions(min_pandas_version='0.25.0'):
    """Verify that the installed pandas is recent enough.

    Parameters
    ----------
    min_pandas_version : str
        Minimum acceptable pandas version (default '0.25.0', the version
        the original check was hard-coded against).

    Raises
    ------
    ValueError
        If the installed pandas version is older than min_pandas_version.
    """
    from pkg_resources import parse_version
    if parse_version(pd.__version__) < parse_version(min_pandas_version):
        raise ValueError('your pandas version is too old --- please update pandas')


if __name__ == '__main__':

Expand Down
47 changes: 26 additions & 21 deletions finemapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,7 @@
import glob
import subprocess
from importlib import reload


def set_snpid_index(df, copy=False):
    """Set a canonical 'snpid' index (CHR.BP.allele1.allele2) on df.

    Alleles are placed in a deterministic order so a variant's id does not
    depend on which allele the input labels A1: the pair is swapped only
    when A1 >= A2 lexicographically and both alleles are single characters.
    Operates in place unless copy=True; returns the indexed frame.
    """
    if copy:
        df = df.copy()
    # Rows whose allele pair must be swapped to reach canonical order.
    swap = ~((df['A1'] < df['A2']) | (df['A1'].str.len() > 1) | (df['A2'].str.len() > 1))
    allele1 = df['A1'].mask(swap, df['A2'])
    allele2 = df['A2'].mask(swap, df['A1'])
    df.index = df['CHR'].astype(str) + '.' + df['BP'].astype(str) + '.' + allele1 + '.' + allele2
    df.index.name = 'snpid'
    return df
from polyfun_utils import set_snpid_index



Expand Down Expand Up @@ -117,15 +104,24 @@ def __init__(self, genotypes_file, sumstats_file, n, chr_num, ldstore_exe,
self.chr = chr_num


def sync_ld_sumstats(self, ld, df_ld_snps):
def sync_ld_sumstats(self, ld, df_ld_snps, allow_missing=False):
df_ld_snps = set_snpid_index(df_ld_snps)
assert ld.shape[0] == df_ld_snps.shape[0]
assert ld.shape[0] == ld.shape[1]
df_ld = pd.DataFrame(ld, index=df_ld_snps.index, columns=df_ld_snps.index)

#make sure that all SNPs in the sumstats file are in the LD file
if not np.all(self.df_sumstats_locus.index.isin(df_ld.index)):
raise ValueError('not all SNPs in the sumstats file were found in the LD matrix')
if allow_missing:
num_missing = np.sum(~self.df_sumstats_locus.index.isin(df_ld.index))
logging.warning('%d variants with sumstats were not found in the LD file and will be omitted (please note that this may lead to false positives if the omitted SNPs are causal!)'%(num_missing))
self.df_sumstats_locus = self.df_sumstats_locus.loc[self.df_sumstats_locus.index.isin(df_ld.index)]
assert np.all(self.df_sumstats_locus.index.isin(df_ld.index))
else:
error_msg = ('not all SNPs in the sumstats file were found in the LD matrix!'
'You could drop the missing SNPs with the flag --allow-missing, but please note that'
' these omitted SNPs may be causal, in which case you may get false positive results...')
raise ValueError(error_msg)

#filter LD to only SNPs found in the sumstats file
assert not np.any(self.df_sumstats_locus.index.duplicated())
Expand All @@ -142,7 +138,7 @@ def sync_ld_sumstats(self, ld, df_ld_snps):
self.df_ld_snps = df_ld_snps


def set_locus(self, locus_start, locus_end, extract_ld=True, read_ld_matrix=False, verbose=False):
def set_locus(self, locus_start, locus_end, extract_ld=True, read_ld_matrix=False, verbose=False, allow_missing=False):

#update self.df_sumstats_locus
self.df_sumstats_locus = self.df_sumstats.query('%d <= BP <= %d'%(locus_start, locus_end))
Expand Down Expand Up @@ -248,7 +244,16 @@ def set_locus(self, locus_start, locus_end, extract_ld=True, read_ld_matrix=Fals
df_ld_snps.rename(columns={'RSID':'SNP', 'position':'BP', 'chromosome':'CHR', 'A_allele':'A1', 'B_allele':'A2'}, inplace=True, errors='raise')
df_ld_snps = set_snpid_index(df_ld_snps)
if not np.all(self.df_sumstats_locus.index.isin(df_ld_snps.index)):
raise IOError('Not all variants exist in LDStore output')
if allow_missing:
num_missing = np.sum(~self.df_sumstats_locus.index.isin(df_ld_snps.index))
logging.warning('%d variants with sumstats were not found in the LD file and will be omitted (please note that this may lead to false positives if the omitted SNPs are causal!)'%(num_missing))
self.df_sumstats_locus = self.df_sumstats_locus.loc[self.df_sumstats_locus.index.isin(df_ld_snps.index)]
assert np.all(self.df_sumstats_locus.index.isin(df_ld_snps.index))
else:
error_msg = ('not all SNPs in the sumstats file were found in the LD matrix!'
'You could drop the missing SNPs with the flag --allow-missing, but please note that'
' these omitted SNPs may be causal, in which case you may get false positive results...')
raise ValueError(error_msg)

#create incl-variants file if needed
if df_ld_snps.shape[0] == self.df_sumstats_locus.shape[0]:
Expand Down Expand Up @@ -375,7 +380,7 @@ def __init__(self, genotypes_file, sumstats_file, n, chr_num, ldstore_exe, sampl



def finemap(self, locus_start, locus_end, num_causal_snps, use_prior_causal_prob=True, prior_var=None, residual_var=None, hess=False, verbose=False, ld=None, df_ld_snps=None, debug_dir=None):
def finemap(self, locus_start, locus_end, num_causal_snps, use_prior_causal_prob=True, prior_var=None, residual_var=None, hess=False, verbose=False, ld=None, df_ld_snps=None, debug_dir=None, allow_missing=False):

#check params
if use_prior_causal_prob and 'SNPVAR' not in self.df_sumstats.columns:
Expand All @@ -384,9 +389,9 @@ def finemap(self, locus_start, locus_end, num_causal_snps, use_prior_causal_prob
raise ValueError('either both or none of ld, df_ld_SNPs should be specified')

#set locus
self.set_locus(locus_start, locus_end, read_ld_matrix=True, verbose=verbose, extract_ld=(ld is None))
self.set_locus(locus_start, locus_end, read_ld_matrix=True, verbose=verbose, extract_ld=(ld is None), allow_missing=allow_missing)
if ld is not None:
self.sync_ld_sumstats(ld, df_ld_snps)
self.sync_ld_sumstats(ld, df_ld_snps, allow_missing=allow_missing)

#define prior causal probabilities
if use_prior_causal_prob:
Expand Down
33 changes: 11 additions & 22 deletions munge_polyfun_sumstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,7 @@
import scipy.stats as stats
import logging
from pandas.api.types import is_integer_dtype

def check_package_versions():
    """Abort with a clear error if the installed pandas is older than 0.25.0."""
    from pkg_resources import parse_version
    installed = parse_version(pd.__version__)
    required = parse_version('0.25.0')
    if installed < required:
        raise ValueError('your pandas version is too old --- please update pandas')



def configure_logger(out_prefix):

logFormatter = logging.Formatter("[%(levelname)s] %(message)s")
logger = logging.getLogger()
logger.setLevel(logging.NOTSET)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

fileHandler = logging.FileHandler(out_prefix+'.log')
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

from polyfun_utils import check_package_versions, configure_logger


def compute_Neff(df_sumstats, n, chi2_cutoff=30):
Expand Down Expand Up @@ -199,6 +178,16 @@ def sanity_checks(df_sumstats):
raise ValueError('Some chromosome values are not integers. Please double-check your input')
if not is_integer_dtype(df_sumstats['BP']):
raise ValueError('Some base-pair values are not integers. Please double-check your input')

#check for duplicates
df_snp = df_sumstats['CHR'].astype('str') + '.' + \
df_sumstats['BP'].astype('str') + '.' + \
df_sumstats['A1'].astype('str') + '.' + \
df_sumstats['A2'].astype('str')
if np.any(df_snp.duplicated()):
raise ValueError('The input file includes duplicate SNPs')

#compute Z


def convert_odds_ratio_to_log(df_sumstats):
Expand Down
Loading

0 comments on commit 02b2a7b

Please sign in to comment.