Skip to content

Commit

Permalink
compute_ldscores_from_ld doesn't exclude long-range LD regions
Browse files Browse the repository at this point in the history
  • Loading branch information
omerwe committed Apr 10, 2022
1 parent 9efb110 commit 1b6de74
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 14 deletions.
33 changes: 23 additions & 10 deletions compute_ldscores_from_ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,11 +135,21 @@ def download_ukb_ld_file(chr_num, region_start, overwrite=False, ld_dir=None, no

#download the region files
for suffix in ['npz', 'gz']:

suffix_file = os.path.join(ld_dir, '%s.%s'%(ld_prefix, suffix))
url = '%s/%s.%s'%(UKBB_LD_URL, ld_prefix, suffix)
with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc='downloading %s'%(url)) as t:

#special handling for long-range LD regions
try:
urllib.request.urlopen(url)
except urllib.request.HTTPError:
url += '2'
urllib.request.urlopen(url)

with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc='downloading %s'%(url)) as t:
urllib.request.urlretrieve(url, filename=suffix_file, reporthook=t.update_to)


#load the LD matrix to memory
df_R, _ = load_ld_npz(ld_dir, ld_prefix)

Expand Down Expand Up @@ -198,15 +208,15 @@ def compute_ldscores_chr(df_annot_chr, ld_dir=None, use_ukb=False, n=None, ld_fi
assert len(df_annot_chr['CHR'].unique()) == 1
chr_num = df_annot_chr['CHR'].unique()[0]

#remove long-range LD regions
for r in LONG_RANGE_LD_REGIONS:
if r['chr'] != chr_num: continue
is_in_r = df_annot_chr['BP'].between(r['start'], r['end'])
if not np.any(is_in_r): continue
logging.warning('Removing %d SNPs from long-range LD region on chromosome %d BP %d-%d'%(is_in_r.sum(), r['chr'], r['start'], r['end']))
df_annot_chr = df_annot_chr.loc[~is_in_r]
if df_annot_chr.shape[0]==0:
raise ValueError('No SNPs found in chromosome %d (ater removing long-range LD regions)'%(chr_num))
# #remove long-range LD regions
# for r in LONG_RANGE_LD_REGIONS:
# if r['chr'] != chr_num: continue
# is_in_r = df_annot_chr['BP'].between(r['start'], r['end'])
# if not np.any(is_in_r): continue
# logging.warning('Removing %d SNPs from long-range LD region on chromosome %d BP %d-%d'%(is_in_r.sum(), r['chr'], r['start'], r['end']))
# df_annot_chr = df_annot_chr.loc[~is_in_r]
# if df_annot_chr.shape[0]==0:
# raise ValueError('No SNPs found in chromosome %d (after removing long-range LD regions)'%(chr_num))

#sort the SNPs by BP if needed
if not np.all(np.diff(df_annot_chr['BP'])>=0):
Expand Down Expand Up @@ -234,6 +244,9 @@ def compute_ldscores_chr(df_annot_chr, ld_dir=None, use_ukb=False, n=None, ld_fi
df_annot_region = df_annot_chr.query('%d <= BP <= %d'%(region_start, region_end))
if df_annot_region.shape[0]==0: continue

#skip over HLA region
if chr_num==6 and region_start in [28000001, 29000001, 30000001]: continue

#download the LD data
df_R_region = download_ukb_ld_file(chr_num, region_start, ld_dir=ld_dir, no_cache=no_cache)

Expand Down
22 changes: 18 additions & 4 deletions ldsc_polyfun/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,22 @@ def ldscore_fromlist(flist, num=None):
for fh_i, fh in enumerate(flist):
y = ldscore(fh, num)
if len(ldscore_array)>0:

#make sure that all files contain the same SNPs in the same order
if ((not series_eq(y.index, ldscore_array[0].index) or not series_eq(y.SNP, ldscore_array[0].SNP))):
raise ValueError('LD Scores for concatenation must have identical SNP columns (and A1/A2 columns if such columns exist).')
else: # keep SNP and CHR column from only the first file
y = y.drop(columns=['SNP', 'CHR'], axis=1)
all_baseline_snps_found = np.all(ldscore_array[0].index.isin(y.index))
if not all_baseline_snps_found:
raise ValueError('Some SNPs in the first set of annotations are not found in the one of the other sets of annotations')
extra_snps_found = np.any(~(y.index.isin(ldscore_array[0].index)))
if extra_snps_found:
logging.warning('some SNPs in one of the sets of annotations are not found in the first set of annotations. We will ignore these SNPs')

#reorder the SNPs to make sure that they're in the corret order
y = y.loc[ldscore_array[0].index]
assert series_eq(y.index, ldscore_array[0].index) and series_eq(y.SNP, ldscore_array[0].SNP)

# keep SNP and CHR column from only the first file
y = y.drop(columns=['SNP', 'CHR'], axis=1)

new_col_dict = {c: c + '_' + str(fh_i) for c in y.columns if c not in ['SNP', 'CHR']}
y.rename(columns=new_col_dict, inplace=True)
Expand Down Expand Up @@ -192,7 +204,7 @@ def ldscore(fh, num=None):
chr_ld = []
for i in tqdm(range(1, num+1)):
chr_ld.append(l2_parser(sub_chr(fh, i) + suffix + s, compression))
x = pd.concat(chr_ld) # automatically sorted by chromosome
x = pd.concat(chr_ld, axis=0) # automatically sorted by chromosome
del chr_ld
else: # just one file
s, compression = which_compression(fh + suffix)
Expand All @@ -205,6 +217,8 @@ def ldscore(fh, num=None):
if not is_sorted: break
if not is_sorted:
x.sort_values(by=['CHR', 'BP'], inplace=True) # SEs will be wrong unless sorted


x.drop(columns=['BP'], inplace=True)

if x.index.name == 'snpid':
Expand Down

0 comments on commit 1b6de74

Please sign in to comment.