compute_ldscores_from_ld doesn't exclude long-range LD regions

RobinM-code · Apr 10, 2022 · 1b6de74 · 1b6de74
1 parent 9efb110
commit 1b6de74
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 14 deletions.
diff --git a/compute_ldscores_from_ld.py b/compute_ldscores_from_ld.py
@@ -135,11 +135,21 @@ def download_ukb_ld_file(chr_num, region_start, overwrite=False, ld_dir=None, no
 
     #download the region files
     for suffix in ['npz', 'gz']:
+
         suffix_file = os.path.join(ld_dir, '%s.%s'%(ld_prefix, suffix))
         url = '%s/%s.%s'%(UKBB_LD_URL, ld_prefix, suffix)
-        with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc='downloading %s'%(url)) as t:                  
+
+        #special handling for long-range LD regions
+        try:
+            urllib.request.urlopen(url)
+        except urllib.request.HTTPError:
+            url += '2'
+            urllib.request.urlopen(url)        
+
+        with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc='downloading %s'%(url)) as t:
             urllib.request.urlretrieve(url, filename=suffix_file, reporthook=t.update_to)
 
+
     #load the LD matrix to memory
     df_R, _ = load_ld_npz(ld_dir, ld_prefix)
 
@@ -198,15 +208,15 @@ def compute_ldscores_chr(df_annot_chr, ld_dir=None, use_ukb=False, n=None, ld_fi
     assert len(df_annot_chr['CHR'].unique()) == 1
     chr_num = df_annot_chr['CHR'].unique()[0]
 
-    #remove long-range LD regions
-    for r in LONG_RANGE_LD_REGIONS:
-        if r['chr'] != chr_num: continue
-        is_in_r = df_annot_chr['BP'].between(r['start'], r['end'])
-        if not np.any(is_in_r): continue
-        logging.warning('Removing %d SNPs from long-range LD region on chromosome %d BP %d-%d'%(is_in_r.sum(), r['chr'], r['start'], r['end']))
-        df_annot_chr = df_annot_chr.loc[~is_in_r]
-    if df_annot_chr.shape[0]==0:
-        raise ValueError('No SNPs found in chromosome %d (ater removing long-range LD regions)'%(chr_num))
+    # #remove long-range LD regions
+    # for r in LONG_RANGE_LD_REGIONS:
+        # if r['chr'] != chr_num: continue
+        # is_in_r = df_annot_chr['BP'].between(r['start'], r['end'])
+        # if not np.any(is_in_r): continue
+        # logging.warning('Removing %d SNPs from long-range LD region on chromosome %d BP %d-%d'%(is_in_r.sum(), r['chr'], r['start'], r['end']))
+        # df_annot_chr = df_annot_chr.loc[~is_in_r]
+    # if df_annot_chr.shape[0]==0:
+        # raise ValueError('No SNPs found in chromosome %d (after removing long-range LD regions)'%(chr_num))
 
     #sort the SNPs by BP if needed
     if not np.all(np.diff(df_annot_chr['BP'])>=0):
@@ -234,6 +244,9 @@ def compute_ldscores_chr(df_annot_chr, ld_dir=None, use_ukb=False, n=None, ld_fi
             df_annot_region = df_annot_chr.query('%d <= BP <= %d'%(region_start, region_end))
             if df_annot_region.shape[0]==0: continue
 
+            #skip over HLA region
+            if chr_num==6 and region_start in [28000001, 29000001, 30000001]: continue
+
             #download the LD data
             df_R_region = download_ukb_ld_file(chr_num, region_start, ld_dir=ld_dir, no_cache=no_cache)
 

diff --git a/ldsc_polyfun/parse.py b/ldsc_polyfun/parse.py
@@ -127,10 +127,22 @@ def ldscore_fromlist(flist, num=None):
     for fh_i, fh in enumerate(flist):
         y = ldscore(fh, num)
         if len(ldscore_array)>0:
+
+            #make sure that all files contain the same SNPs in the same order
             if ((not series_eq(y.index, ldscore_array[0].index) or not series_eq(y.SNP, ldscore_array[0].SNP))):
-                raise ValueError('LD Scores for concatenation must have identical SNP columns (and A1/A2 columns if such columns exist).')
-            else:  # keep SNP and CHR column from only the first file
-                y = y.drop(columns=['SNP', 'CHR'], axis=1)
+                all_baseline_snps_found = np.all(ldscore_array[0].index.isin(y.index))
+                if not all_baseline_snps_found:
+                    raise ValueError('Some SNPs in the first set of annotations are not found in the one of the other sets of annotations')
+                extra_snps_found = np.any(~(y.index.isin(ldscore_array[0].index)))
+                if extra_snps_found:
+                    logging.warning('some SNPs in one of the sets of annotations are not found in the first set of annotations. We will ignore these SNPs')
+
+                #reorder the SNPs to make sure that they're in the corret order
+                y = y.loc[ldscore_array[0].index]
+                assert series_eq(y.index, ldscore_array[0].index) and series_eq(y.SNP, ldscore_array[0].SNP)
+
+            # keep SNP and CHR column from only the first file
+            y = y.drop(columns=['SNP', 'CHR'], axis=1)
 
         new_col_dict = {c: c + '_' + str(fh_i) for c in y.columns if c not in ['SNP', 'CHR']}
         y.rename(columns=new_col_dict, inplace=True)
@@ -192,7 +204,7 @@ def ldscore(fh, num=None):
         chr_ld = []
         for i in tqdm(range(1, num+1)):
             chr_ld.append(l2_parser(sub_chr(fh, i) + suffix + s, compression))
-        x = pd.concat(chr_ld)  # automatically sorted by chromosome
+        x = pd.concat(chr_ld, axis=0)  # automatically sorted by chromosome
         del chr_ld
     else:  # just one file
         s, compression = which_compression(fh + suffix)
@@ -205,6 +217,8 @@ def ldscore(fh, num=None):
         if not is_sorted: break            
     if not is_sorted:
         x.sort_values(by=['CHR', 'BP'], inplace=True) # SEs will be wrong unless sorted
+
+
     x.drop(columns=['BP'], inplace=True)
 
     if x.index.name == 'snpid':