forked from omerwe/polyfun
added support for genome-wide fine-mapping
Showing 4 changed files with 209 additions and 7 deletions.
@@ -0,0 +1,85 @@
import numpy as np; np.set_printoptions(precision=4, linewidth=200)
import pandas as pd; pd.set_option('display.width', 200)
import os
import logging
from tqdm import tqdm
from polyfun import configure_logger, check_package_versions
from polyfun_utils import set_snpid_index, DEFAULT_REGIONS_FILE
from pyarrow import ArrowIOError
from pyarrow.lib import ArrowInvalid


def main(args):

    #read the sumstats file (parquet if possible, otherwise whitespace-delimited text)
    try:
        df_sumstats = pd.read_parquet(args.sumstats)
    except (ArrowIOError, ArrowInvalid):
        df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)

    #read the regions file and keep only regions that contain at least one sumstats SNP
    df_regions = pd.read_table(args.regions_file)
    df_regions = df_regions.loc[df_regions.apply(lambda r: np.any((df_sumstats['CHR']==r['CHR']) & (df_sumstats['BP'].between(r['START'], r['END']))), axis=1)]

    #aggregate the per-region outputs
    df_sumstats_list = []
    logging.info('Aggregating results...')
    for _, r in tqdm(df_regions.iterrows()):
        chr_num, start, end, url_prefix = r['CHR'], r['START'], r['END'], r['URL_PREFIX']
        output_file_r = '%s.chr%s.%s_%s.gz'%(args.out_prefix, chr_num, start, end)
        if not os.path.exists(output_file_r):
            err_msg = 'output file for chromosome %d bp %d-%d doesn\'t exist'%(chr_num, start, end)
            if args.allow_missing_jobs:
                logging.warning(err_msg)
                continue
            else:
                raise IOError(err_msg)
        df_sumstats_r = pd.read_table(output_file_r)

        #mark each SNP's distance from the center of its region
        middle = (start+end)//2
        df_sumstats_r['DISTANCE_FROM_CENTER'] = np.abs(df_sumstats_r['BP'] - middle)
        df_sumstats_list.append(df_sumstats_r)

    #keep only the most central result for each SNP (regions overlap, so a SNP can appear several times)
    df_sumstats = pd.concat(df_sumstats_list, axis=0)
    df_sumstats.sort_values('DISTANCE_FROM_CENTER', inplace=True, ascending=True)
    df_sumstats = set_snpid_index(df_sumstats, allow_duplicates=True)
    df_sumstats = df_sumstats.loc[~df_sumstats.index.duplicated(keep='first')]
    del df_sumstats['DISTANCE_FROM_CENTER']
    df_sumstats.sort_values(['CHR', 'BP'], inplace=True, ascending=True)

    #write the aggregated output file
    df_sumstats.to_csv(args.out, sep='\t', index=False)
    logging.info('Wrote aggregated results to %s'%(args.out))


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()

    #general parameters
    parser.add_argument('--sumstats', required=True, help='Name of sumstats file')
    parser.add_argument('--out-prefix', required=True, help='Prefix of the per-region output files')
    parser.add_argument('--out', required=True, help='Name of the aggregated output file')
    parser.add_argument('--allow-missing-jobs', default=False, action='store_true', help='If specified, regions with missing output files are skipped with a warning instead of raising an error')
    parser.add_argument('--regions-file', default=DEFAULT_REGIONS_FILE, help='Name of the file of regions and their LD matrix URL prefixes')

    #check package versions
    check_package_versions()

    #extract args
    args = parser.parse_args()

    #check that the output directory exists
    if len(os.path.dirname(args.out))>0 and not os.path.exists(os.path.dirname(args.out)):
        raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.out)))

    #configure logger
    configure_logger(args.out_prefix)

    #invoke main function
    main(args)
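The deduplication step above is what makes the aggregation safe genome-wide: fine-mapping regions overlap, so the same SNP can appear in several per-region output files, and only the result from the region whose center is closest to the SNP is kept. Here is a self-contained toy illustration of that logic, using a plain SNP column in place of polyfun's set_snpid_index (all values are made up):

#toy illustration of keeping only the most central result per SNP
import pandas as pd

df = pd.DataFrame({'SNP': ['rs1', 'rs1', 'rs2'],
                   'BP': [150, 150, 220],
                   'DISTANCE_FROM_CENTER': [50, 10, 30]})   #rs1 was fine-mapped in two overlapping regions
df = df.sort_values('DISTANCE_FROM_CENTER')                 #most central result first
df = df.loc[~df['SNP'].duplicated(keep='first')]            #keep one row per SNP
print(df)   #rs1 retains the row with DISTANCE_FROM_CENTER=10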
@@ -0,0 +1,101 @@
import numpy as np; np.set_printoptions(precision=4, linewidth=200)
import pandas as pd; pd.set_option('display.width', 200)
import os
import logging
from polyfun import configure_logger, check_package_versions
from pyarrow import ArrowIOError
from pyarrow.lib import ArrowInvalid
from polyfun_utils import DEFAULT_REGIONS_FILE


FINEMAPPER_SCRIPT = 'finemapper.py'


def create_finemapper_cmd(args, chr_num, start, end, url_prefix):

    output_file = '%s.chr%s.%s_%s.gz'%(args.out_prefix, chr_num, start, end)
    cmd = '%s %s --chr %s --start %s --end %s --out %s'%(args.python, FINEMAPPER_SCRIPT, chr_num, start, end, output_file)
    if args.max_num_causal>1:
        cmd += ' --ld %s'%(url_prefix)

    #forward all other command-line arguments to the finemapper script
    for key, value in vars(args).items():
        if key in ['python', 'regions_file', 'out_prefix', 'jobs_file']: continue
        key = key.replace('_', '-')
        if type(value)==bool:
            if value:
                cmd += ' --%s'%(key)
        elif value is not None:
            cmd += ' --%s %s'%(key, value)

    return cmd


def main(args):

    #read the sumstats file (parquet if possible, otherwise whitespace-delimited text)
    try:
        df_sumstats = pd.read_parquet(args.sumstats)
    except (ArrowIOError, ArrowInvalid):
        df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)

    #read the regions file and keep only regions that contain at least one sumstats SNP
    df_regions = pd.read_table(args.regions_file)
    df_regions = df_regions.loc[df_regions.apply(lambda r: np.any((df_sumstats['CHR']==r['CHR']) & (df_sumstats['BP'].between(r['START'], r['END']))), axis=1)]

    #write one fine-mapping command per region to the jobs file
    with open(args.jobs_file, 'w') as f:
        for _, r in df_regions.iterrows():
            chr_num, start, end, url_prefix = r['CHR'], r['START'], r['END'], r['URL_PREFIX']
            cmd = create_finemapper_cmd(args, chr_num, start, end, url_prefix)
            f.write(cmd + '\n')

    logging.info('Wrote fine-mapping commands to %s'%(args.jobs_file))


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()

    #general parameters
    parser.add_argument('--method', required=True, help='Fine-mapping method (currently susie and finemap are supported)')
    parser.add_argument('--sumstats', required=True, help='Name of sumstats file')
    parser.add_argument('--n', required=True, type=int, help='Sample size')

    #LDstore-related parameters
    parser.add_argument('--finemap-exe', default=None, help='Path to FINEMAP v1.4 executable file')
    parser.add_argument('--memory', type=int, default=1, help='Maximum amount of memory in GB to allocate to LDstore')
    parser.add_argument('--threads', type=int, default=None, help='The number of CPU cores LDstore will use (if not specified, LDstore will use the maximum number of CPU cores available)')

    parser.add_argument('--max-num-causal', required=True, type=int, help='Maximum number of causal SNPs')
    parser.add_argument('--non-funct', action='store_true', default=False, help='Perform non-functionally informed fine-mapping')
    parser.add_argument('--hess', action='store_true', default=False, help='If specified, estimate causal effect variance via HESS')
    parser.add_argument('--verbose', action='store_true', default=False, help='If specified, show verbose output')
    parser.add_argument('--allow-missing', default=False, action='store_true', help='If specified, SNPs with sumstats that are not \
        found in the LD panel will be omitted. This is not recommended, because the omitted SNPs may be causal, \
        which could lead to false-positive results')

    parser.add_argument('--regions-file', default=DEFAULT_REGIONS_FILE, help='Name of the file of regions and their LD matrix URL prefixes')
    parser.add_argument('--python', default='python3', help='python3 executable')
    parser.add_argument('--out-prefix', required=True, help='Prefix of the output files')
    parser.add_argument('--jobs-file', required=True, help='Name of the file that will hold the fine-mapping commands')

    #check package versions
    check_package_versions()

    #extract args
    args = parser.parse_args()

    #check that the output directories exist
    if len(os.path.dirname(args.out_prefix))>0 and not os.path.exists(os.path.dirname(args.out_prefix)):
        raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.out_prefix)))
    if len(os.path.dirname(args.jobs_file))>0 and not os.path.exists(os.path.dirname(args.jobs_file)):
        raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.jobs_file)))

    #configure logger
    configure_logger(args.out_prefix)

    #invoke main function
    main(args)
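To make the flag-forwarding loop in create_finemapper_cmd concrete, here is a sketch of the command string generated for one region. It assumes the script above is saved as create_finemapper_jobs.py (its name in the upstream polyfun repository) so that the function can be imported; all argument values and the LD URL prefix are illustrative:

#sketch: inspect one generated command (all values below are illustrative)
from argparse import Namespace
from create_finemapper_jobs import create_finemapper_cmd   #assumed filename of the script above

args = Namespace(method='susie', sumstats='sumstats.parquet', n=300000,
                 finemap_exe=None, memory=1, threads=None, max_num_causal=5,
                 non_funct=False, hess=False, verbose=False, allow_missing=False,
                 regions_file='regions.tsv', python='python3',
                 out_prefix='output/finemap', jobs_file='jobs.txt')
cmd = create_finemapper_cmd(args, chr_num=1, start=1, end=3000001, url_prefix='chr1_1_3000001')
print(cmd)
#python3 finemapper.py --chr 1 --start 1 --end 3000001 --out output/finemap.chr1.1_3000001.gz \
#  --ld chr1_1_3000001 --method susie --sumstats sumstats.parquet --n 300000 --memory 1 --max-num-causal 5

Note that False booleans and None values are dropped, so only flags that were actually set are forwarded to each per-region job.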
Binary file not shown.