Skip to content

Commit

Permalink
code rearrangement
Browse files Browse the repository at this point in the history
  • Loading branch information
omerwe committed Mar 10, 2020
1 parent 196c1fb commit 02b2a7b
Show file tree
Hide file tree
Showing 9 changed files with 51 additions and 231 deletions.
28 changes: 2 additions & 26 deletions compute_ldscores.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import time
from ldsc_polyfun import ldscore, parse
import logging
from tqdm import tqdm
from pandas.api.types import is_numeric_dtype
from polyfun_utils import configure_logger



def __filter__(fname, noun, verb, merge_obj):
Expand All @@ -28,31 +29,6 @@ def __filter__(fname, noun, verb, merge_obj):
return merged_list



class TqdmHandler(logging.StreamHandler):
    """Logging handler that prints records via tqdm.write so that log
    messages do not corrupt an active tqdm progress bar."""

    def __init__(self):
        super().__init__()

    def emit(self, record):
        # Format the record with the configured formatter, then let tqdm
        # print it and redraw any active progress bar afterwards.
        tqdm.write(self.format(record))


def configure_logger(out_prefix):
    """Configure root logging: console output through a TqdmHandler plus a
    file handler writing to <out_prefix>.log, both using the
    '[LEVEL] message' format."""
    formatter = logging.Formatter("[%(levelname)s] %(message)s")
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.NOTSET)

    # Console handler first (tqdm-aware), then the log file handler.
    for handler in (TqdmHandler(), logging.FileHandler(out_prefix + '.log')):
        handler.setFormatter(formatter)
        root_logger.addHandler(handler)



def compute_ldscores(args):

Expand Down
37 changes: 1 addition & 36 deletions compute_ldscores_ukb.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import tempfile
import scipy.sparse as sparse
from pandas.api.types import is_numeric_dtype
from polyfun_utils import configure_logger, set_snpid_index


UKBB_LD_URL = 'https://data.broadinstitute.org/alkesgroup/UKBB_LD'
Expand All @@ -18,14 +19,7 @@
META_COLUMNS = ['SNP', 'CHR', 'BP', 'A1', 'A2']


class TqdmHandler(logging.StreamHandler):
    """StreamHandler variant whose output goes through tqdm.write,
    keeping log lines from breaking tqdm progress-bar rendering."""

    def __init__(self):
        logging.StreamHandler.__init__(self)

    def emit(self, record):
        # Delegate formatting to the base-class machinery, then hand the
        # message to tqdm for bar-safe printing.
        formatted = self.format(record)
        tqdm.write(formatted)


class TqdmUpTo(tqdm):
"""
Expand All @@ -36,35 +30,6 @@ def update_to(self, b=1, bsize=1, tsize=None):
if tsize is not None: self.total = tsize
self.update(b * bsize - self.n)


def configure_logger(out_prefix):
    """Set up the root logger with two destinations: the console (via a
    tqdm-aware handler) and a log file named <out_prefix>.log."""
    log_format = logging.Formatter("[%(levelname)s] %(message)s")
    root = logging.getLogger()
    root.setLevel(logging.NOTSET)

    console_handler = TqdmHandler()
    console_handler.setFormatter(log_format)
    root.addHandler(console_handler)

    file_handler = logging.FileHandler('%s.log' % (out_prefix))
    file_handler.setFormatter(log_format)
    root.addHandler(file_handler)


def set_snpid_index(df, copy=False):
    """Index df by a canonical 'snpid' of the form CHR.BP.allele1.allele2.

    The allele pair is ordered deterministically so the same variant gets
    the same id regardless of which allele was listed as A1: A1 is kept
    first when A1 < A2 lexicographically or when either allele is
    multi-character (an indel); otherwise the two alleles are swapped.

    Mutates df in place unless copy=True; returns the indexed frame.
    """
    if copy:
        df = df.copy()
    # True where the (A1, A2) order is already canonical.
    keep_order = (df['A1'] < df['A2']) | (df['A1'].str.len() > 1) | (df['A2'].str.len() > 1)
    first_allele = df['A1'].where(keep_order, df['A2'])
    second_allele = df['A2'].where(keep_order, df['A1'])
    df.index = df['CHR'].astype(str) + '.' + df['BP'].astype(str) + '.' + first_allele + '.' + second_allele
    df.index.name = 'snpid'
    return df


def read_annot(annot_file):
try:
Expand Down
Binary file modified example_data/chr1.finemap_sumstats.txt.gz
Binary file not shown.
7 changes: 1 addition & 6 deletions extract_snpvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,9 @@
import numpy as np
import os
import logging
from polyfun import configure_logger
from pyarrow import ArrowIOError
from polyfun_utils import check_package_versions, configure_logger

def check_package_versions(min_pandas_version='0.25.0'):
    """Verify that the installed pandas is recent enough.

    Parameters
    ----------
    min_pandas_version : str
        Minimum acceptable pandas version (default '0.25.0', the version
        the original check was hard-coded against).

    Raises
    ------
    ValueError
        If the installed pandas version is older than min_pandas_version.
    """
    from pkg_resources import parse_version
    if parse_version(pd.__version__) < parse_version(min_pandas_version):
        raise ValueError('your pandas version is too old --- please update pandas')


if __name__ == '__main__':

Expand Down
47 changes: 26 additions & 21 deletions finemapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,7 @@
import glob
import subprocess
from importlib import reload


def set_snpid_index(df, copy=False):
    """Set a canonical 'snpid' index (CHR.BP.allele1.allele2) on df.

    Alleles are placed in a deterministic order so a variant's id does not
    depend on which allele the input labels A1: the pair is swapped only
    when A1 >= A2 lexicographically and both alleles are single characters.
    Operates in place unless copy=True; returns the indexed frame.
    """
    if copy:
        df = df.copy()
    # Rows whose allele pair must be swapped to reach canonical order.
    swap = ~((df['A1'] < df['A2']) | (df['A1'].str.len() > 1) | (df['A2'].str.len() > 1))
    allele1 = df['A1'].mask(swap, df['A2'])
    allele2 = df['A2'].mask(swap, df['A1'])
    df.index = df['CHR'].astype(str) + '.' + df['BP'].astype(str) + '.' + allele1 + '.' + allele2
    df.index.name = 'snpid'
    return df
from polyfun_utils import set_snpid_index



Expand Down Expand Up @@ -117,15 +104,24 @@ def __init__(self, genotypes_file, sumstats_file, n, chr_num, ldstore_exe,
self.chr = chr_num


def sync_ld_sumstats(self, ld, df_ld_snps):
def sync_ld_sumstats(self, ld, df_ld_snps, allow_missing=False):
df_ld_snps = set_snpid_index(df_ld_snps)
assert ld.shape[0] == df_ld_snps.shape[0]
assert ld.shape[0] == ld.shape[1]
df_ld = pd.DataFrame(ld, index=df_ld_snps.index, columns=df_ld_snps.index)

#make sure that all SNPs in the sumstats file are in the LD file
if not np.all(self.df_sumstats_locus.index.isin(df_ld.index)):
raise ValueError('not all SNPs in the sumstats file were found in the LD matrix')
if allow_missing:
num_missing = np.sum(~self.df_sumstats_locus.index.isin(df_ld.index))
logging.warning('%d variants with sumstats were not found in the LD file and will be omitted (please note that this may lead to false positives if the omitted SNPs are causal!)'%(num_missing))
self.df_sumstats_locus = self.df_sumstats_locus.loc[self.df_sumstats_locus.index.isin(df_ld.index)]
assert np.all(self.df_sumstats_locus.index.isin(df_ld.index))
else:
error_msg = ('not all SNPs in the sumstats file were found in the LD matrix!'
'You could drop the missing SNPs with the flag --allow-missing, but please note that'
' these omitted SNPs may be causal, in which case you may get false positive results...')
raise ValueError(error_msg)

#filter LD to only SNPs found in the sumstats file
assert not np.any(self.df_sumstats_locus.index.duplicated())
Expand All @@ -142,7 +138,7 @@ def sync_ld_sumstats(self, ld, df_ld_snps):
self.df_ld_snps = df_ld_snps


def set_locus(self, locus_start, locus_end, extract_ld=True, read_ld_matrix=False, verbose=False):
def set_locus(self, locus_start, locus_end, extract_ld=True, read_ld_matrix=False, verbose=False, allow_missing=False):

#update self.df_sumstats_locus
self.df_sumstats_locus = self.df_sumstats.query('%d <= BP <= %d'%(locus_start, locus_end))
Expand Down Expand Up @@ -248,7 +244,16 @@ def set_locus(self, locus_start, locus_end, extract_ld=True, read_ld_matrix=Fals
df_ld_snps.rename(columns={'RSID':'SNP', 'position':'BP', 'chromosome':'CHR', 'A_allele':'A1', 'B_allele':'A2'}, inplace=True, errors='raise')
df_ld_snps = set_snpid_index(df_ld_snps)
if not np.all(self.df_sumstats_locus.index.isin(df_ld_snps.index)):
raise IOError('Not all variants exist in LDStore output')
if allow_missing:
num_missing = np.sum(~self.df_sumstats_locus.index.isin(df_ld_snps.index))
logging.warning('%d variants with sumstats were not found in the LD file and will be omitted (please note that this may lead to false positives if the omitted SNPs are causal!)'%(num_missing))
self.df_sumstats_locus = self.df_sumstats_locus.loc[self.df_sumstats_locus.index.isin(df_ld_snps.index)]
assert np.all(self.df_sumstats_locus.index.isin(df_ld_snps.index))
else:
error_msg = ('not all SNPs in the sumstats file were found in the LD matrix!'
'You could drop the missing SNPs with the flag --allow-missing, but please note that'
' these omitted SNPs may be causal, in which case you may get false positive results...')
raise ValueError(error_msg)

#create incl-variants file if needed
if df_ld_snps.shape[0] == self.df_sumstats_locus.shape[0]:
Expand Down Expand Up @@ -375,7 +380,7 @@ def __init__(self, genotypes_file, sumstats_file, n, chr_num, ldstore_exe, sampl



def finemap(self, locus_start, locus_end, num_causal_snps, use_prior_causal_prob=True, prior_var=None, residual_var=None, hess=False, verbose=False, ld=None, df_ld_snps=None, debug_dir=None):
def finemap(self, locus_start, locus_end, num_causal_snps, use_prior_causal_prob=True, prior_var=None, residual_var=None, hess=False, verbose=False, ld=None, df_ld_snps=None, debug_dir=None, allow_missing=False):

#check params
if use_prior_causal_prob and 'SNPVAR' not in self.df_sumstats.columns:
Expand All @@ -384,9 +389,9 @@ def finemap(self, locus_start, locus_end, num_causal_snps, use_prior_causal_prob
raise ValueError('either both or none of ld, df_ld_SNPs should be specified')

#set locus
self.set_locus(locus_start, locus_end, read_ld_matrix=True, verbose=verbose, extract_ld=(ld is None))
self.set_locus(locus_start, locus_end, read_ld_matrix=True, verbose=verbose, extract_ld=(ld is None), allow_missing=allow_missing)
if ld is not None:
self.sync_ld_sumstats(ld, df_ld_snps)
self.sync_ld_sumstats(ld, df_ld_snps, allow_missing=allow_missing)

#define prior causal probabilities
if use_prior_causal_prob:
Expand Down
33 changes: 11 additions & 22 deletions munge_polyfun_sumstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,7 @@
import scipy.stats as stats
import logging
from pandas.api.types import is_integer_dtype

def check_package_versions():
    """Abort with a clear error if the installed pandas is older than 0.25.0."""
    from pkg_resources import parse_version
    installed = parse_version(pd.__version__)
    required = parse_version('0.25.0')
    if installed < required:
        raise ValueError('your pandas version is too old --- please update pandas')



def configure_logger(out_prefix):

logFormatter = logging.Formatter("[%(levelname)s] %(message)s")
logger = logging.getLogger()
logger.setLevel(logging.NOTSET)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

fileHandler = logging.FileHandler(out_prefix+'.log')
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

from polyfun_utils import check_package_versions, configure_logger


def compute_Neff(df_sumstats, n, chi2_cutoff=30):
Expand Down Expand Up @@ -199,6 +178,16 @@ def sanity_checks(df_sumstats):
raise ValueError('Some chromosome values are not integers. Please double-check your input')
if not is_integer_dtype(df_sumstats['BP']):
raise ValueError('Some base-pair values are not integers. Please double-check your input')

#check for duplicates
df_snp = df_sumstats['CHR'].astype('str') + '.' + \
df_sumstats['BP'].astype('str') + '.' + \
df_sumstats['A1'].astype('str') + '.' + \
df_sumstats['A2'].astype('str')
if np.any(df_snp.duplicated()):
raise ValueError('The input file includes duplicate SNPs')

#compute Z


def convert_odds_ratio_to_log(df_sumstats):
Expand Down
Loading

0 comments on commit 02b2a7b

Please sign in to comment.