workaround for pandas 1.1.1 bug with delim_whitespace
omerwe committed Sep 6, 2020
1 parent 9027c48 commit 7227ed5
Showing 11 changed files with 19 additions and 19 deletions.
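For context: pandas treats sep='\s+' as a special case that, like delim_whitespace=True, splits fields on any run of whitespace, so the substitution preserves parsing behavior while sidestepping the pandas 1.1.1 bug named in the commit title. Below is a minimal sketch of the equivalence, using made-up data; it runs on pandas versions that still accept delim_whitespace (deprecated in pandas 2.2):

import io
import pandas as pd

# Made-up whitespace-delimited sumstats snippet, for illustration only
data = 'SNP CHR BP A1 A2\nrs1 1 1000 A G\nrs2 1 2000 C T\n'

# Both calls split fields on runs of whitespace and should parse identically
df_regex = pd.read_table(io.StringIO(data), sep=r'\s+')
df_delim = pd.read_table(io.StringIO(data), delim_whitespace=True)
assert df_regex.equals(df_delim)

One caveat: the replacement lines below spell the pattern as a plain literal '\s+'. Because \s is not a recognized Python escape, the backslash survives and pandas sees the intended regex, but a raw string (r'\s+') expresses the same pattern without relying on that quirk, which Python 3.12 flags with a SyntaxWarning.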
aggregate_finemapper_results.py (2 changes: 1 addition & 1 deletion)

@@ -17,7 +17,7 @@ def main(args):
     try:
         df_sumstats = pd.read_parquet(args.sumstats)
     except (ArrowIOError, ArrowInvalid):
-        df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)
+        df_sumstats = pd.read_table(args.sumstats, sep='\s+')

     #compute p-values if needed
     if args.pvalue_cutoff is not None:
compute_ldscores.py (4 changes: 2 additions & 2 deletions)

@@ -48,7 +48,7 @@ def compute_ldscores(args):
     try:
         df_annot = pd.read_parquet(args.annot)
     except (ArrowIOError, ArrowInvalid):
-        df_annot = pd.read_table(args.annot, delim_whitespace=True)
+        df_annot = pd.read_table(args.annot, sep='\s+')

     #Remove annotations of SNPs that are not in the .bim file
     df_annot = set_snpid_index(df_annot)
@@ -75,7 +75,7 @@ def compute_ldscores(args):

     #find #individuals in bfile
     fam_file = args.bfile+'.fam'
-    df_fam = pd.read_table(fam_file, header=None, usecols=[5], delim_whitespace=True)
+    df_fam = pd.read_table(fam_file, header=None, usecols=[5], sep='\s+')
     n = df_fam.shape[0]

     #find keep_indivs
compute_ldscores_ukb.py (4 changes: 2 additions & 2 deletions)

@@ -24,7 +24,7 @@ def read_annot(annot_file):
     try:
         df_annot = pd.read_parquet(annot_file)
     except (ArrowIOError, ArrowInvalid):
-        df_annot = pd.read_table(annot_file, delim_whitespace=True)
+        df_annot = pd.read_table(annot_file, sep='\s+')

     assert 'CHR' in df_annot.columns
     assert 'SNP' in df_annot.columns
@@ -56,7 +56,7 @@ def load_ld_matrix(ld_dir, ld_prefix):
     #load the SNPs metadata
     gz_file = os.path.join(ld_dir, '%s.gz'%(ld_prefix))
     try:
-        df_ld_snps = pd.read_table(gz_file, delim_whitespace=True)
+        df_ld_snps = pd.read_table(gz_file, sep='\s+')
     except (ArrowIOError, ArrowInvalid):
         raise IOError('Corrupt file downloaded')
     df_ld_snps.rename(columns={'rsid':'SNP', 'chromosome':'CHR', 'position':'BP', 'allele1':'A1', 'allele2':'A2'}, inplace=True, errors='ignore')
create_finemapper_jobs.py (2 changes: 1 addition & 1 deletion)

@@ -38,7 +38,7 @@ def main(args):
     try:
         df_sumstats = pd.read_parquet(args.sumstats)
     except (ArrowIOError, ArrowInvalid):
-        df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)
+        df_sumstats = pd.read_table(args.sumstats, sep='\s+')

     #compute p-values if needed
     if args.pvalue_cutoff is not None:
extract_annotations.py (4 changes: 2 additions & 2 deletions)

@@ -33,7 +33,7 @@
     try:
         df_snps = pd.read_parquet(args.pips)
     except (ArrowIOError, ArrowInvalid):
-        df_snps = pd.read_table(args.pips, delim_whitespace=True)
+        df_snps = pd.read_table(args.pips, sep='\s+')
     if 'A1' not in df_snps.columns:
         raise ValueError('missing column A1')
     if 'A2' not in df_snps.columns:
@@ -59,7 +59,7 @@
     try:
         df_annot = pd.read_parquet(args.annot)
     except (ArrowIOError, ArrowInvalid):
-        df_annot = pd.read_table(args.annot, delim_whitespace=True)
+        df_annot = pd.read_table(args.annot, sep='\s+')
     df_annot = set_snpid_index(df_annot)
     logging.info('Done in %0.2f seconds'%(time.time() - t0))

extract_snpvar.py (2 changes: 1 addition & 1 deletion)

@@ -31,7 +31,7 @@
     try:
         df_snps = pd.read_parquet(args.sumstats)
     except (ArrowIOError, ArrowInvalid):
-        df_snps = pd.read_table(args.sumstats, delim_whitespace=True)
+        df_snps = pd.read_table(args.sumstats, sep='\s+')
     if 'A1' not in df_snps.columns:
         raise ValueError('missing column A1')
     if 'A2' not in df_snps.columns:
finemapper.py (4 changes: 2 additions & 2 deletions)

@@ -56,7 +56,7 @@ def load_ld_npz(ld_prefix):
     if os.path.exists(snps_filename_parquet):
         df_ld_snps = pd.read_parquet(snps_filename_parquet)
     elif os.path.exists(snps_filename_gz):
-        df_ld_snps = pd.read_table(snps_filename_gz, delim_whitespace=True)
+        df_ld_snps = pd.read_table(snps_filename_gz, sep='\s+')
         df_ld_snps.rename(columns={'allele1':'A1', 'allele2':'A2', 'position':'BP', 'chromosome':'CHR', 'rsid':'SNP'}, inplace=True, errors='ignore')
     else:
         raise ValueError('couldn\'t find SNPs file %s or %s'%(snps_filename_parquet, snps_filename_gz))
@@ -227,7 +227,7 @@ def __init__(self, genotypes_file, sumstats_file, n, chr_num, ldstore_exe,
         try:
             df_sumstats = pd.read_parquet(sumstats_file)
         except (ArrowIOError, ArrowInvalid):
-            df_sumstats = pd.read_table(sumstats_file, delim_whitespace=True)
+            df_sumstats = pd.read_table(sumstats_file, sep='\s+')
         if not np.any(df_sumstats['CHR'] == chr_num):
             raise IOError('sumstats file does not include any SNPs in chromosome %s'%(chr_num))
         if np.any(df_sumstats['CHR'] != chr_num):
munge_polyfun_sumstats.py (2 changes: 1 addition & 1 deletion)

@@ -222,7 +222,7 @@ def convert_odds_ratio_to_log(df_sumstats):
     #read sumstats file
     logging.info('Reading sumstats file...')
     t0 = time.time()
-    df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)
+    df_sumstats = pd.read_table(args.sumstats, sep='\s+')
     logging.info('Done in %0.2f seconds'%(time.time()-t0))

     #convert odds-ratio to log-odds ratio if needed
polyfun.py (6 changes: 3 additions & 3 deletions)

@@ -185,7 +185,7 @@ def run_ldsc(self, args, use_ridge, nn, keep_large, evenodd_split, n_blocks=2):
         try:
             df_sumstats = pd.read_parquet(args.sumstats)
         except (ArrowIOError, ArrowInvalid):
-            df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)
+            df_sumstats = pd.read_table(args.sumstats, sep='\s+')
         ###merge everything together...

         #prepare LD-scores for S-LDSC run
@@ -556,7 +556,7 @@ def save_snpvar_to_disk(self, args, use_ridge, constrain_range):
         try:
             df_sumstats = pd.read_parquet(args.sumstats)
         except (ArrowIOError, ArrowInvalid):
-            df_sumstats = pd.read_table(args.sumstats, delim_whitespace=True)
+            df_sumstats = pd.read_table(args.sumstats, sep='\s+')
         df_sumstats.drop(columns=['SNP'], errors='ignore', inplace=True)
         for col in ['CHR', 'BP', 'A1', 'A2']:
             if col not in df_sumstats.columns:
@@ -687,7 +687,7 @@ def compute_ldscores_plink_chr(self, args, chr_num, df_bins_chr):

         #find #individuals in bfile
         fam_file = get_file_name(args, 'fam', chr_num)
-        df_fam = pd.read_table(fam_file, header=None, usecols=[5], delim_whitespace=True)
+        df_fam = pd.read_table(fam_file, header=None, usecols=[5], sep='\s+')
         n = df_fam.shape[0]

         #find keep_indivs
polyloc.py (4 changes: 2 additions & 2 deletions)

@@ -121,7 +121,7 @@ def load_posterior_betas(self, args):
         try:
             df_posterior = pd.read_parquet(args.posterior)
         except (ArrowIOError, ArrowInvalid):
-            df_posterior = pd.read_table(args.posterior, delim_whitespace=True)
+            df_posterior = pd.read_table(args.posterior, sep='\s+')

         #preprocess columns
         df_posterior.columns = df_posterior.columns.str.upper()
@@ -147,7 +147,7 @@ def polyloc_partitions(self, args):
         #add another partition for all SNPs not in the posterior file
         df_bim_list = []
         for chr_num in range(1,23):
-            df_bim_chr = pd.read_table(args.bfile_chr+'%d.bim'%(chr_num), delim_whitespace=True, names=['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], header=None)
+            df_bim_chr = pd.read_table(args.bfile_chr+'%d.bim'%(chr_num), sep='\s+', names=['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], header=None)
             df_bim_list.append(df_bim_chr)
         df_bim = pd.concat(df_bim_list, axis=0)
         df_bim = set_snpid_index(df_bim)
test_polyfun.py (4 changes: 2 additions & 2 deletions)

@@ -17,9 +17,9 @@ def compare_dfs(dir1, dir2, filename, sort_column=None):
     if not os.path.exists(file2):
         raise IOError('%s not found'%(file2))
     if file1.endswith('.parquet'): df1 = pd.read_parquet(file1)
-    else: df1 = pd.read_table(file1, delim_whitespace=True)
+    else: df1 = pd.read_table(file1, sep='\s+')
     if file2.endswith('.parquet'): df2 = pd.read_parquet(file2)
-    else: df2 = pd.read_table(file2, delim_whitespace=True)
+    else: df2 = pd.read_table(file2, sep='\s+')
     assert np.all(df1.shape == df2.shape), 'found dimension mismatch between %s and %s'%(file1, file2)
     assert np.all(df1.columns == df2.columns), 'found mismatch between %s and %s'%(file1, file2)
     if sort_column is not None:
