added support for genome-wide fine-mapping
omerwe committed Jul 21, 2020
1 parent 5ce89e0 commit 3e52731
Showing 4 changed files with 209 additions and 7 deletions.
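The two new scripts split genome-wide fine-mapping into per-region jobs and then merge the per-region outputs. A minimal sketch of the intended workflow, driving both scripts from Python; the method, sample size, file names and output directory below are placeholders, not values from this commit:

import subprocess

# 1. Write one finemapper.py command per region of the regions file that overlaps the sumstats
subprocess.run(['python3', 'create_finemapper_jobs.py',
                '--method', 'susie',
                '--sumstats', 'sumstats.parquet',
                '--n', '383290',
                '--max-num-causal', '5',
                '--out-prefix', 'finemap_output/job',
                '--jobs-file', 'finemap_jobs.txt'],
               check=True)

# 2. After running every line of finemap_jobs.txt (locally or on a cluster),
#    merge the per-region outputs into a single genome-wide file
subprocess.run(['python3', 'aggregate_finemapper_results.py',
                '--sumstats', 'sumstats.parquet',
                '--out-prefix', 'finemap_output/job',
                '--out', 'finemap_output/aggregated.txt.gz',
                '--allow-missing-jobs'],
               check=True)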
85 changes: 85 additions & 0 deletions aggregate_finemapper_results.py
@@ -0,0 +1,85 @@
import numpy as np; np.set_printoptions(precision=4, linewidth=200)
import pandas as pd; pd.set_option('display.width', 200)
import os
import logging
from tqdm import tqdm
from polyfun import configure_logger, check_package_versions
from polyfun_utils import set_snpid_index
from pyarrow import ArrowIOError
from pyarrow.lib import ArrowInvalid
from polyfun_utils import DEFAULT_REGIONS_FILE


def main(args):

#read sumstats file
try:
df_sumstats = pd.read_parquet(args.sumstats)
except (ArrowIOError, ArrowInvalid):
df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)

#read regions file
df_regions = pd.read_table(args.regions_file)
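#keep only regions that overlap at least one SNP in the sumstats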
df_regions = df_regions.loc[df_regions.apply(lambda r: np.any((df_sumstats['CHR']==r['CHR']) & (df_sumstats['BP'].between(r['START'], r['END']))), axis=1)]

#aggregate outputs
df_sumstats_list = []
logging.info('Aggregating results...')
for _, r in tqdm(df_regions.iterrows()):
chr_num, start, end, url_prefix = r['CHR'], r['START'], r['END'], r['URL_PREFIX']
output_file_r = '%s.chr%s.%s_%s.gz'%(args.out_prefix, chr_num, start, end)
if not os.path.exists(output_file_r):
err_msg = 'output file for chromosome %d bp %d-%d doesn\'t exist'%(chr_num, start, end)
if args.allow_missing_jobs:
logging.warning(err_msg)
continue
else:
raise IOError(err_msg)
df_sumstats_r = pd.read_table(output_file_r)

#mark distance from center
middle = (start+end)//2
df_sumstats_r['DISTANCE_FROM_CENTER'] = np.abs(df_sumstats_r['BP'] - middle)
df_sumstats_list.append(df_sumstats_r)

#keep only the most central result for each SNP
df_sumstats = pd.concat(df_sumstats_list, axis=0)
df_sumstats.sort_values('DISTANCE_FROM_CENTER', inplace=True, ascending=True)
df_sumstats = set_snpid_index(df_sumstats, allow_duplicates=True)
df_sumstats = df_sumstats.loc[~df_sumstats.index.duplicated(keep='first')]
del df_sumstats['DISTANCE_FROM_CENTER']
df_sumstats.sort_values(['CHR', 'BP'], inplace=True, ascending=True)

#write output file
df_sumstats.to_csv(args.out, sep='\t', index=False)
logging.info('Wrote aggregated results to %s'%(args.out))



if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()

#general parameters
parser.add_argument('--sumstats', required=True, help='Name of sumstats file')
parser.add_argument('--out-prefix', required=True, help='prefix of output files')
parser.add_argument('--out', required=True, help='name of the aggregated output file')
parser.add_argument('--allow-missing-jobs', default=False, action='store_true', help='whether to allow missing jobs')
parser.add_argument('--regions-file', default=DEFAULT_REGIONS_FILE, help='name of the file of regions and their URL prefixes')

#check package versions
check_package_versions()

#extract args
args = parser.parse_args()

#check that the output directory exists
if len(os.path.dirname(args.out))>0 and not os.path.exists(os.path.dirname(args.out)):
raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.out)))

#configure logger
configure_logger(args.out_prefix)

#invoke main function
main(args)
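
Because neighboring regions in the regions file can overlap, a SNP may be fine-mapped more than once; the script above keeps, for each SNP, the estimate from the region whose center is closest to that SNP. A minimal sketch of that deduplication step on a toy table (the index and values are made up; the real index comes from set_snpid_index):

import pandas as pd

df = pd.DataFrame({'PIP': [0.80, 0.55],
                   'DISTANCE_FROM_CENTER': [50_000, 1_400_000]},
                  index=['1.12345.A.G', '1.12345.A.G'])   # same SNP, from two overlapping regions
df = df.sort_values('DISTANCE_FROM_CENTER')               # most central result first
df = df.loc[~df.index.duplicated(keep='first')]           # keeps only the PIP=0.80 row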

101 changes: 101 additions & 0 deletions create_finemapper_jobs.py
@@ -0,0 +1,101 @@
import numpy as np; np.set_printoptions(precision=4, linewidth=200)
import pandas as pd; pd.set_option('display.width', 200)
import os
import logging
from polyfun import configure_logger, check_package_versions
from pyarrow import ArrowIOError
from pyarrow.lib import ArrowInvalid
from polyfun_utils import DEFAULT_REGIONS_FILE


FINEMAPPER_SCRIPT = 'finemapper.py'


def create_finemapper_cmd(args, chr_num, start, end, url_prefix):

output_file = '%s.chr%s.%s_%s.gz'%(args.out_prefix, chr_num, start, end)
cmd = '%s %s --chr %s --start %s --end %s --out %s'%(args.python, FINEMAPPER_SCRIPT, chr_num, start, end, output_file)
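#LD information is only needed when searching for more than one causal SNP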
if args.max_num_causal>1:
cmd += ' --ld %s'%(url_prefix)

#add command line arguments
for key, value in vars(args).items():
if key in ['python', 'regions_file', 'out_prefix', 'jobs_file']: continue
key = key.replace('_', '-')
if type(value)==bool:
if value:
cmd += ' --%s'%(key)
elif value is not None:
cmd += ' --%s %s'%(key, value)

return cmd


def main(args):

#read sumstats file
try:
df_sumstats = pd.read_parquet(args.sumstats)
except (ArrowIOError, ArrowInvalid):
df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)

#read regions file
df_regions = pd.read_table(args.regions_file)
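#keep only regions that overlap at least one SNP in the sumstats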
df_regions = df_regions.loc[df_regions.apply(lambda r: np.any((df_sumstats['CHR']==r['CHR']) & (df_sumstats['BP'].between(r['START'], r['END']))), axis=1)]

#create jobs
with open(args.jobs_file, 'w') as f:
for _, r in df_regions.iterrows():
chr_num, start, end, url_prefix = r['CHR'], r['START'], r['END'], r['URL_PREFIX']
cmd = create_finemapper_cmd(args, chr_num, start, end, url_prefix)
f.write(cmd + '\n')

logging.info('Wrote fine-mapping commands to %s'%(args.jobs_file))



if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()

#general parameters
parser.add_argument('--method', required=True, help='Fine-mapping method (currently susie and finemap are supported)')
parser.add_argument('--sumstats', required=True, help='Name of sumstats file')
parser.add_argument('--n', required=True, type=int, help='Sample size')

#LDstore related parameters
parser.add_argument('--finemap-exe', default=None, help='Path to FINEMAP v1.4 executable file')
parser.add_argument('--memory', type=int, default=1, help='Maximum amount of memory in GB to allocate to LDStore')
parser.add_argument('--threads', type=int, default=None, help='The number of CPU cores LDstore will use (if not specified, LDstore will use the max number of CPU cores available)')

parser.add_argument('--max-num-causal', required=True, type=int, help='Number of causal SNPs')
parser.add_argument('--non-funct', action='store_true', default=False, help='Perform non-functionally informed fine-mapping')
parser.add_argument('--hess', action='store_true', default=False, help='If specified, estimate causal effect variance via HESS')
parser.add_argument('--verbose', action='store_true', default=False, help='If specified, show verbose output')
parser.add_argument('--allow-missing', default=False, action='store_true', help='If specified, SNPs with sumstats that are not \
found in the LD panel will be omitted. This is not recommended, because the omitted SNPs may be causal,\
which could lead to false positive results')

parser.add_argument('--regions-file', default=DEFAULT_REGIONS_FILE, help='name of the file of regions and their URL prefixes')
parser.add_argument('--python', default='python3', help='python3 executable')
parser.add_argument('--out-prefix', required=True, help='prefix of the output files')
parser.add_argument('--jobs-file', required=True, help='name of file with fine-mapping commands')

#check package versions
check_package_versions()

#extract args
args = parser.parse_args()

#check that the output directory exists
if len(os.path.dirname(args.out_prefix))>0 and not os.path.exists(os.path.dirname(args.out_prefix)):
raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.out_prefix)))
if len(os.path.dirname(args.jobs_file))>0 and not os.path.exists(os.path.dirname(args.jobs_file)):
raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.jobs_file)))

#configure logger
configure_logger(args.out_prefix)

#invoke main function
main(args)
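
create_finemapper_cmd forwards every parsed argument to finemapper.py, skipping the job-creation-only options, emitting boolean flags only when they are True and dropping options whose value is None. A small illustration of that forwarding rule with a hypothetical namespace:

import argparse

args = argparse.Namespace(method='susie', n=383290, max_num_causal=5,
                          non_funct=False, verbose=True, finemap_exe=None)  # hypothetical values
cmd = 'python3 finemapper.py'
for key, value in vars(args).items():
    key = key.replace('_', '-')
    if type(value)==bool:
        if value:
            cmd += ' --%s'%(key)          # boolean flags are emitted only when set
    elif value is not None:
        cmd += ' --%s %s'%(key, value)    # None means "not specified", so it is skipped
print(cmd)  # python3 finemapper.py --method susie --n 383290 --max-num-causal 5 --verbose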

30 changes: 23 additions & 7 deletions polyfun_utils.py
@@ -7,9 +7,24 @@


SNP_COLUMNS = ['CHR', 'SNP', 'BP', 'A1', 'A2']
LONG_RANGE_LD_REGIONS = []
LONG_RANGE_LD_REGIONS.append({'chr':6, 'start':25500000, 'end':33500000})
LONG_RANGE_LD_REGIONS.append({'chr':8, 'start':8000000, 'end':12000000})
LONG_RANGE_LD_REGIONS.append({'chr':11, 'start':46000000, 'end':57000000})
DEFAULT_REGIONS_FILE = 'ukb_regions.tsv.gz'



class TqdmUpTo(tqdm):
"""
taken from: https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py
"""

def update_to(self, b=1, bsize=1, tsize=None):
if tsize is not None: self.total = tsize
self.update(b * bsize - self.n)



''' Logger class (for compatibility with LDSC code)'''
class Logger(object):
@@ -39,7 +54,7 @@ def check_package_versions():
raise ValueError('\n\nPlease install the python package pandas_plink (using either "pip install pandas-plink" or "conda install -c conda-forge pandas-plink")\n\n')


def set_snpid_index(df, copy=False):
def set_snpid_index(df, copy=False, allow_duplicates=False):
if copy:
df = df.copy()
is_indel = (df['A1'].str.len()>1) | (df['A2'].str.len()>1)
@@ -53,12 +68,13 @@ def set_snpid_index(df, copy=False):
df.drop(columns=['A1_first', 'A1s', 'A2s'], inplace=True)

#check for duplicate SNPs
is_duplicate_snp = df.index.duplicated()
if np.any(is_duplicate_snp):
df_dup_snps = df.loc[is_duplicate_snp]
df_dup_snps = df_dup_snps.loc[~df_dup_snps.index.duplicated(), ['SNP', 'CHR', 'BP', 'A1', 'A2']]
error_msg = 'Duplicate SNPs were found in the input data:\n%s'%(df_dup_snps)
raise ValueError(error_msg)
if not allow_duplicates:
is_duplicate_snp = df.index.duplicated()
if np.any(is_duplicate_snp):
df_dup_snps = df.loc[is_duplicate_snp]
df_dup_snps = df_dup_snps.loc[~df_dup_snps.index.duplicated(), ['SNP', 'CHR', 'BP', 'A1', 'A2']]
error_msg = 'Duplicate SNPs were found in the input data:\n%s'%(df_dup_snps)
raise ValueError(error_msg)
return df
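
The new allow_duplicates flag exists because aggregate_finemapper_results.py concatenates overlapping regions on purpose, so the same SNP can legitimately appear twice before deduplication. A sketch of the difference, using toy rows:

import pandas as pd
from polyfun_utils import set_snpid_index

df = pd.DataFrame({'CHR': [1, 1], 'SNP': ['rs1', 'rs1'], 'BP': [12345, 12345],
                   'A1': ['A', 'A'], 'A2': ['G', 'G']})   # the same SNP listed twice
set_snpid_index(df, copy=True, allow_duplicates=True)     # OK: both rows are kept
set_snpid_index(df, copy=True)                            # raises ValueError (duplicate SNPs)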


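The TqdmUpTo helper added to polyfun_utils.py follows the tqdm wget example it links to; a sketch of how it could wrap a download (the URL and file name are placeholders):

import urllib.request
from polyfun_utils import TqdmUpTo

url = 'https://example.com/some_ld_matrix.npz'   # placeholder URL
with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
    urllib.request.urlretrieve(url, filename='ld_matrix.npz', reporthook=t.update_to)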
Binary file added ukb_regions.tsv.gz
Binary file not shown.
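
The added ukb_regions.tsv.gz is a gzipped TSV; judging from how the two new scripts read it, it contains (at least) the columns CHR, START, END and URL_PREFIX. A sketch of building a compatible custom regions file, with made-up rows and URLs:

import pandas as pd

df_regions = pd.DataFrame({
    'CHR':        [1, 1],
    'START':      [1, 1500001],
    'END':        [3000001, 4500001],
    'URL_PREFIX': ['https://example.com/chr1_1_3000001',
                   'https://example.com/chr1_1500001_4500001'],
})
df_regions.to_csv('my_regions.tsv.gz', sep='\t', index=False)   # pass via --regions-file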
