added support for genome-wide fine-mapping
omerwe committed Jul 21, 2020
1 parent 5ce89e0 commit 3e52731
Showing 4 changed files with 209 additions and 7 deletions.
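The two new scripts split genome-wide fine-mapping into per-region jobs and then merge the per-region outputs. A minimal sketch of the intended workflow, driving both scripts from Python; the method, sample size, file names and output directory below are placeholders, not values from this commit:

import subprocess

# 1. Write one finemapper.py command per region of the regions file that overlaps the sumstats
subprocess.run(['python3', 'create_finemapper_jobs.py',
                '--method', 'susie',
                '--sumstats', 'sumstats.parquet',
                '--n', '383290',
                '--max-num-causal', '5',
                '--out-prefix', 'finemap_output/job',
                '--jobs-file', 'finemap_jobs.txt'],
               check=True)

# 2. After running every line of finemap_jobs.txt (locally or on a cluster),
#    merge the per-region outputs into a single genome-wide file
subprocess.run(['python3', 'aggregate_finemapper_results.py',
                '--sumstats', 'sumstats.parquet',
                '--out-prefix', 'finemap_output/job',
                '--out', 'finemap_output/aggregated.txt.gz',
                '--allow-missing-jobs'],
               check=True)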
85 changes: 85 additions & 0 deletions aggregate_finemapper_results.py
@@ -0,0 +1,85 @@
import numpy as np; np.set_printoptions(precision=4, linewidth=200)
import pandas as pd; pd.set_option('display.width', 200)
import os
import logging
from tqdm import tqdm
from polyfun import configure_logger, check_package_versions
from polyfun_utils import set_snpid_index
from pyarrow import ArrowIOError
from pyarrow.lib import ArrowInvalid
from polyfun_utils import DEFAULT_REGIONS_FILE


def main(args):

#read sumstats file
try:
df_sumstats = pd.read_parquet(args.sumstats)
except (ArrowIOError, ArrowInvalid):
df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)

#read regions file
df_regions = pd.read_table(args.regions_file)
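#keep only regions that overlap at least one SNP in the sumstats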
df_regions = df_regions.loc[df_regions.apply(lambda r: np.any((df_sumstats['CHR']==r['CHR']) & (df_sumstats['BP'].between(r['START'], r['END']))), axis=1)]

#aggregate outputs
df_sumstats_list = []
logging.info('Aggregating results...')
for _, r in tqdm(df_regions.iterrows()):
chr_num, start, end, url_prefix = r['CHR'], r['START'], r['END'], r['URL_PREFIX']
output_file_r = '%s.chr%s.%s_%s.gz'%(args.out_prefix, chr_num, start, end)
if not os.path.exists(output_file_r):
err_msg = 'output file for chromosome %d bp %d-%d doesn\'t exist'%(chr_num, start, end)
if args.allow_missing_jobs:
logging.warning(err_msg)
continue
else:
raise IOError(err_msg)
df_sumstats_r = pd.read_table(output_file_r)

#mark distance from center
middle = (start+end)//2
df_sumstats_r['DISTANCE_FROM_CENTER'] = np.abs(df_sumstats_r['BP'] - middle)
df_sumstats_list.append(df_sumstats_r)

#keep only the most central result for each SNP
df_sumstats = pd.concat(df_sumstats_list, axis=0)
df_sumstats.sort_values('DISTANCE_FROM_CENTER', inplace=True, ascending=True)
df_sumstats = set_snpid_index(df_sumstats, allow_duplicates=True)
df_sumstats = df_sumstats.loc[~df_sumstats.index.duplicated(keep='first')]
del df_sumstats['DISTANCE_FROM_CENTER']
df_sumstats.sort_values(['CHR', 'BP'], inplace=True, ascending=True)

#write output file
df_sumstats.to_csv(args.out, sep='\t', index=False)
logging.info('Wrote aggregated results to %s'%(args.out))



if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()

#general parameters
parser.add_argument('--sumstats', required=True, help='Name of sumstats file')
parser.add_argument('--out-prefix', required=True, help='prefix of output files')
parser.add_argument('--out', required=True, help='name of the aggregated output file')
parser.add_argument('--allow-missing-jobs', default=False, action='store_true', help='whether to allow missing jobs')
parser.add_argument('--regions-file', default=DEFAULT_REGIONS_FILE, help='name of the file of regions and their URL prefixes')

#check package versions
check_package_versions()

#extract args
args = parser.parse_args()

#check that the output directory exists
if len(os.path.dirname(args.out))>0 and not os.path.exists(os.path.dirname(args.out)):
raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.out)))

#configure logger
configure_logger(args.out_prefix)

#invoke main function
main(args)
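
Because neighboring regions in the regions file can overlap, a SNP may be fine-mapped more than once; the script above keeps, for each SNP, the estimate from the region whose center is closest to that SNP. A minimal sketch of that deduplication step on a toy table (the index and values are made up; the real index comes from set_snpid_index):

import pandas as pd

df = pd.DataFrame({'PIP': [0.80, 0.55],
                   'DISTANCE_FROM_CENTER': [50_000, 1_400_000]},
                  index=['1.12345.A.G', '1.12345.A.G'])   # same SNP, from two overlapping regions
df = df.sort_values('DISTANCE_FROM_CENTER')               # most central result first
df = df.loc[~df.index.duplicated(keep='first')]           # keeps only the PIP=0.80 row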

101 changes: 101 additions & 0 deletions create_finemapper_jobs.py
@@ -0,0 +1,101 @@
import numpy as np; np.set_printoptions(precision=4, linewidth=200)
import pandas as pd; pd.set_option('display.width', 200)
import os
import logging
from polyfun import configure_logger, check_package_versions
from pyarrow import ArrowIOError
from pyarrow.lib import ArrowInvalid
from polyfun_utils import DEFAULT_REGIONS_FILE


FINEMAPPER_SCRIPT = 'finemapper.py'


def create_finemapper_cmd(args, chr_num, start, end, url_prefix):

output_file = '%s.chr%s.%s_%s.gz'%(args.out_prefix, chr_num, start, end)
cmd = '%s %s --chr %s --start %s --end %s --out %s'%(args.python, FINEMAPPER_SCRIPT, chr_num, start, end, output_file)
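#LD information is only needed when searching for more than one causal SNP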
if args.max_num_causal>1:
cmd += ' --ld %s'%(url_prefix)

#add command line arguments
for key, value in vars(args).items():
if key in ['python', 'regions_file', 'out_prefix', 'jobs_file']: continue
key = key.replace('_', '-')
if type(value)==bool:
if value:
cmd += ' --%s'%(key)
elif value is not None:
cmd += ' --%s %s'%(key, value)

return cmd


def main(args):

#read sumstats file
try:
df_sumstats = pd.read_parquet(args.sumstats)
except (ArrowIOError, ArrowInvalid):
df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)

#read regions file
df_regions = pd.read_table(args.regions_file)
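#keep only regions that overlap at least one SNP in the sumstats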
df_regions = df_regions.loc[df_regions.apply(lambda r: np.any((df_sumstats['CHR']==r['CHR']) & (df_sumstats['BP'].between(r['START'], r['END']))), axis=1)]

#create jobs
with open(args.jobs_file, 'w') as f:
for _, r in df_regions.iterrows():
chr_num, start, end, url_prefix = r['CHR'], r['START'], r['END'], r['URL_PREFIX']
cmd = create_finemapper_cmd(args, chr_num, start, end, url_prefix)
f.write(cmd + '\n')

logging.info('Wrote fine-mapping commands to %s'%(args.jobs_file))



if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()

#general parameters
parser.add_argument('--method', required=True, help='Fine-mapping method (currently susie and finemap are supported)')
parser.add_argument('--sumstats', required=True, help='Name of sumstats file')
parser.add_argument('--n', required=True, type=int, help='Sample size')

#LDstore related parameters
parser.add_argument('--finemap-exe', default=None, help='Path to FINEMAP v1.4 executable file')
parser.add_argument('--memory', type=int, default=1, help='Maximum amount of memory in GB to allocate to LDStore')
parser.add_argument('--threads', type=int, default=None, help='The number of CPU cores LDstore will use (if not specified, LDstore will use the max number of CPU cores available)')

parser.add_argument('--max-num-causal', required=True, type=int, help='Number of causal SNPs')
parser.add_argument('--non-funct', action='store_true', default=False, help='Perform non-functionally informed fine-mapping')
parser.add_argument('--hess', action='store_true', default=False, help='If specified, estimate causal effect variance via HESS')
parser.add_argument('--verbose', action='store_true', default=False, help='If specified, show verbose output')
parser.add_argument('--allow-missing', default=False, action='store_true', help='If specified, SNPs with sumstats that are not \
found in the LD panel will be omitted. This is not recommended, because the omitted SNPs may be causal,\
which could lead to false positive results')

parser.add_argument('--regions-file', default=DEFAULT_REGIONS_FILE, help='name of the file of regions and their URL prefixes')
parser.add_argument('--python', default='python3', help='python3 executable')
parser.add_argument('--out-prefix', required=True, help='prefix of the output files')
parser.add_argument('--jobs-file', required=True, help='name of file with fine-mapping commands')

#check package versions
check_package_versions()

#extract args
args = parser.parse_args()

#check that the output directory exists
if len(os.path.dirname(args.out_prefix))>0 and not os.path.exists(os.path.dirname(args.out_prefix)):
raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.out_prefix)))
if len(os.path.dirname(args.jobs_file))>0 and not os.path.exists(os.path.dirname(args.jobs_file)):
raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.jobs_file)))

#configure logger
configure_logger(args.out_prefix)

#invoke main function
main(args)
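
create_finemapper_cmd forwards every parsed argument to finemapper.py, skipping the job-creation-only options, emitting boolean flags only when they are True and dropping options whose value is None. A small illustration of that forwarding rule with a hypothetical namespace:

import argparse

args = argparse.Namespace(method='susie', n=383290, max_num_causal=5,
                          non_funct=False, verbose=True, finemap_exe=None)  # hypothetical values
cmd = 'python3 finemapper.py'
for key, value in vars(args).items():
    key = key.replace('_', '-')
    if type(value)==bool:
        if value:
            cmd += ' --%s'%(key)          # boolean flags are emitted only when set
    elif value is not None:
        cmd += ' --%s %s'%(key, value)    # None means "not specified", so it is skipped
print(cmd)  # python3 finemapper.py --method susie --n 383290 --max-num-causal 5 --verbose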

30 changes: 23 additions & 7 deletions polyfun_utils.py
@@ -7,9 +7,24 @@


SNP_COLUMNS = ['CHR', 'SNP', 'BP', 'A1', 'A2']
LONG_RANGE_LD_REGIONS = []
LONG_RANGE_LD_REGIONS.append({'chr':6, 'start':25500000, 'end':33500000})
LONG_RANGE_LD_REGIONS.append({'chr':8, 'start':8000000, 'end':12000000})
LONG_RANGE_LD_REGIONS.append({'chr':11, 'start':46000000, 'end':57000000})
DEFAULT_REGIONS_FILE = 'ukb_regions.tsv.gz'



class TqdmUpTo(tqdm):
"""
taken from: https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py
"""

def update_to(self, b=1, bsize=1, tsize=None):
if tsize is not None: self.total = tsize
self.update(b * bsize - self.n)



''' Logger class (for compatibility with LDSC code)'''
class Logger(object):
@@ -39,7 +54,7 @@ def check_package_versions():
raise ValueError('\n\nPlease install the python package pandas_plink (using either "pip install pandas-plink" or "conda install -c conda-forge pandas-plink")\n\n')


def set_snpid_index(df, copy=False):
def set_snpid_index(df, copy=False, allow_duplicates=False):
if copy:
df = df.copy()
is_indel = (df['A1'].str.len()>1) | (df['A2'].str.len()>1)
@@ -53,12 +68,13 @@ def set_snpid_index(df, copy=False):
df.drop(columns=['A1_first', 'A1s', 'A2s'], inplace=True)

#check for duplicate SNPs
is_duplicate_snp = df.index.duplicated()
if np.any(is_duplicate_snp):
df_dup_snps = df.loc[is_duplicate_snp]
df_dup_snps = df_dup_snps.loc[~df_dup_snps.index.duplicated(), ['SNP', 'CHR', 'BP', 'A1', 'A2']]
error_msg = 'Duplicate SNPs were found in the input data:\n%s'%(df_dup_snps)
raise ValueError(error_msg)
if not allow_duplicates:
is_duplicate_snp = df.index.duplicated()
if np.any(is_duplicate_snp):
df_dup_snps = df.loc[is_duplicate_snp]
df_dup_snps = df_dup_snps.loc[~df_dup_snps.index.duplicated(), ['SNP', 'CHR', 'BP', 'A1', 'A2']]
error_msg = 'Duplicate SNPs were found in the input data:\n%s'%(df_dup_snps)
raise ValueError(error_msg)
return df
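
The new allow_duplicates flag exists because aggregate_finemapper_results.py concatenates overlapping regions on purpose, so the same SNP can legitimately appear twice before deduplication. A sketch of the difference, using toy rows:

import pandas as pd
from polyfun_utils import set_snpid_index

df = pd.DataFrame({'CHR': [1, 1], 'SNP': ['rs1', 'rs1'], 'BP': [12345, 12345],
                   'A1': ['A', 'A'], 'A2': ['G', 'G']})   # the same SNP listed twice
set_snpid_index(df, copy=True, allow_duplicates=True)     # OK: both rows are kept
set_snpid_index(df, copy=True)                            # raises ValueError (duplicate SNPs)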


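The TqdmUpTo helper added to polyfun_utils.py follows the tqdm wget example it links to; a sketch of how it could wrap a download (the URL and file name are placeholders):

import urllib.request
from polyfun_utils import TqdmUpTo

url = 'https://example.com/some_ld_matrix.npz'   # placeholder URL
with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
    urllib.request.urlretrieve(url, filename='ld_matrix.npz', reporthook=t.update_to)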
Binary file added ukb_regions.tsv.gz
Binary file not shown.
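
The added ukb_regions.tsv.gz is a gzipped TSV; judging from how the two new scripts read it, it contains (at least) the columns CHR, START, END and URL_PREFIX. A sketch of building a compatible custom regions file, with made-up rows and URLs:

import pandas as pd

df_regions = pd.DataFrame({
    'CHR':        [1, 1],
    'START':      [1, 1500001],
    'END':        [3000001, 4500001],
    'URL_PREFIX': ['https://example.com/chr1_1_3000001',
                   'https://example.com/chr1_1500001_4500001'],
})
df_regions.to_csv('my_regions.tsv.gz', sep='\t', index=False)   # pass via --regions-file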
