More robust conversion of chromosome numbers to int

omerwe · omerwe · commit 8b17b4ec87fd · 2024-12-09T22:59:06.000+02:00
diff --git a/ldsc_polyfun/jackknife.py b/ldsc_polyfun/jackknife.py
@@ -574,7 +574,7 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T
         num_lambdas=100, approx_ridge=False, 
         ridge_lambda=None, use_1se=False, has_intercept=False, standardize=True,
         skip_ridge_jackknife=True, num_chr_sets=2, num_chr=22):
-        
+                    
         #sanity checks
         assert chr_num is not None
         # # # chr_num[:100000]=1
diff --git a/ldsc_polyfun/parse.py b/ldsc_polyfun/parse.py
@@ -32,12 +32,20 @@ def read_csv(fh, **kwargs):
     return df
     
 def set_snpid_index(df):
+    
+    def float_to_int(c):
+        try:
+            c = int(c)
+        except ValueError:
+            pass
+        return c
+    
     df['A1_first'] = (df['A1'] < df['A2']) | (df['A1'].str.len()>1) | (df['A2'].str.len()>1)
     df['A1s'] = df['A2'].copy()
     df.loc[df['A1_first'], 'A1s'] = df.loc[df['A1_first'], 'A1'].copy()
     df['A2s'] = df['A1'].copy()
     df.loc[df['A1_first'], 'A2s'] = df.loc[df['A1_first'], 'A2'].copy()
-    s_chr = df['CHR'].map(lambda c: int(c) if str(c)[0] in ['0','1','2','3','4','5,','6','7','8','9'] else c).astype(str)
+    s_chr = df['CHR'].map(float_to_int).astype(str)
     s_bp = df['BP'].astype(int).astype(str)
     df.index = s_chr + '.' + s_bp + '.' + df['A1s'] + '.' + df['A2s']
     df.index.name = 'snpid'
@@ -116,7 +124,7 @@ def sumstats(fh, alleles=True, dropna=True):
     if dropna:
         x = x.dropna(how='any')
         
-    x = set_snpid_index(x)
+    x = set_snpid_index(x)    
     x.drop(columns=['CHR', 'BP'], inplace=True)
 
 
diff --git a/ldsc_polyfun/sumstats.py b/ldsc_polyfun/sumstats.py
@@ -236,7 +236,7 @@ def _print_part_delete_values(ldscore_reg, ofh, log):
 
 def _merge_and_log(ld, sumstats, noun, log):
     '''Wrap smart merge with log messages about # of SNPs.'''
-    sumstats = smart_merge(ld, sumstats)
+    sumstats = smart_merge(ld, sumstats)    
     msg = 'After merging with {F}, {N} SNPs remain.'
     if len(sumstats) == 0:
         msg += ' Please make sure that your annotation files include the SNPs in your sumstats files (please see the PolyFun wiki for details on downloading functional annotations)'
@@ -275,6 +275,7 @@ def _read_ld_sumstats(args, log, fh, alleles=True, dropna=True):
     
     M_annot, ref_ld, novar_cols = _check_variance(log, M_annot, ref_ld)
     w_ld = _read_w_ld(args, log)
+        
     sumstats = _merge_and_log(ref_ld, sumstats, 'reference panel LD', log)
     sumstats = _merge_and_log(sumstats, w_ld, 'regression SNP LD', log)
     w_ld_cname = sumstats.columns[-1]
diff --git a/polyfun.py b/polyfun.py
@@ -188,7 +188,7 @@ def run_ldsc(self, args, use_ridge, nn, keep_large, evenodd_split, n_blocks=2):
                 df_sumstats = pd.read_table(args.sumstats, sep='\s+')            
             ###merge everything together...
             
-        #prepare LD-scores for S-LDSC run
+        #prepare LD-scores for S-LDSC run        
         ref_ld = np.array(df_sumstats[ref_ld_cnames], dtype=np.float32)
         sumstats._check_ld_condnum(args, log, ref_ld_cnames)
         if df_sumstats.shape[0] < 200000: