@@ -105,6 +105,8 @@ def compute_prs_for_file(args,
105
105
plink_cmd += ' --bfile %s --score %s sum' % (plink_file_prefix , betas_file )
106
106
else :
107
107
raise ValueError ('neither --bed nor --pgen specified' )
108
+ if args .center :
109
+ plink_cmd += ' center'
108
110
if ranges_file is not None :
109
111
scores_file = os .path .join (temp_dir , next (tempfile ._get_candidate_names ()))
110
112
df_betas [['SNP_bim' , 'score' ]].drop_duplicates ('SNP_bim' ).to_csv (scores_file , sep = '\t ' , header = False , index = False )
@@ -175,6 +177,7 @@ def load_betas_files(betas_file, verbose=True):
175
177
176
178
#rename columns if needed
177
179
df_betas .rename (columns = {'sid' :'SNP' , 'nt1' :'A1' , 'nt2' :'A2' , 'BETA_MEAN' :'BETA' , 'ldpred_inf_beta' :'BETA' , 'chrom' :'CHR' , 'Chrom' :'CHR' , 'pos' :'BP' }, inplace = True , errors = 'ignore' )
180
+
178
181
if not is_numeric_dtype (df_betas ['CHR' ]):
179
182
if df_betas ['CHR' ].str .startswith ('chrom_' ).all ():
180
183
df_betas ['CHR' ] = df_betas ['CHR' ].str [6 :].astype (np .int64 )
@@ -263,124 +266,191 @@ def computs_prs_all_files(args, betas_file, disable_jackknife=False, keep_file=N
263
266
264
267
265
268
266
def _read_phenotypes(pheno_file):
    """Load a phenotype file into a DataFrame indexed by IID (as strings).

    The file is read as whitespace-delimited with three columns
    (FID, IID, PHENO). A leading header line, if present, is dropped.

    Raises:
        ValueError: if the file has no rows or contains duplicate IIDs.
    """
    df_pheno = pd.read_csv(pheno_file, names=['FID', 'IID', 'PHENO'], index_col='IID', sep=r'\s+')
    if df_pheno.shape[0] == 0:
        raise ValueError('no phenotypes found in %s'%(pheno_file))

    #make sure that we didn't include a header line
    try:
        float(df_pheno['PHENO'].iloc[0])
    except (TypeError, ValueError):
        df_pheno = df_pheno.iloc[1:].copy()
    df_pheno['PHENO'] = df_pheno['PHENO'].astype(np.float64)

    if df_pheno.index.duplicated().any():
        raise ValueError('duplicate ids found in %s'%(pheno_file))
    df_pheno.index = df_pheno.index.astype(str)
    return df_pheno


def _estimate_mixweights(args, df_prs_all, beta_files):
    """Regress the phenotype on the per-file PRS and write <output_prefix>.mixweights.

    Args:
        args: parsed command-line arguments; reads pheno, allow_neg_mixweights
            and output_prefix.
        df_prs_all: column-wise concatenation of the per-file PRS tables,
            holding one 'SCORESUM' column per betas file.
        beta_files: betas file names, in the same order as the SCORESUM columns.

    Raises:
        ValueError: if no individual appears in both the PRS and the phenotypes.
    """
    df_pheno = _read_phenotypes(args.pheno)

    #extract just the SCORESUM columns (the boolean mask always yields a DataFrame)
    df_scores = df_prs_all.loc[:, df_prs_all.columns == 'SCORESUM'].copy()

    #sync the phenotypes and the PRS
    df_scores.index = df_scores.index.astype(str)
    index_shared = df_scores.index.intersection(df_pheno.index)
    if len(index_shared) == 0:
        raise ValueError('no individuals are shared between the PRS and %s'%(args.pheno))
    if len(index_shared) < df_scores.shape[0]:
        df_scores = df_scores.loc[index_shared]
    s_pheno = df_pheno.loc[df_scores.index, 'PHENO']

    #flip PRS that are negatively correlated with the phenotype, so that a
    #non-negativity constraint on the mixing weights remains meaningful
    is_flipped = np.zeros(df_scores.shape[1], dtype=bool)
    linreg_univariate = LinearRegression()
    for c_i in range(df_scores.shape[1]):
        linreg_univariate.fit(df_scores.iloc[:, [c_i]], s_pheno)
        is_flipped[c_i] = linreg_univariate.coef_[0] < 0
    df_scores = df_scores * np.where(is_flipped, -1.0, 1.0)

    #estimate mixing weights
    linreg = LinearRegression(positive=not args.allow_neg_mixweights)
    linreg.fit(df_scores, s_pheno)
    r2 = metrics.r2_score(s_pheno, linreg.predict(df_scores))
    logging.info('In-sample R2: %0.3f'%(r2))

    #undo the flipping in the saved weights, so that they apply to the unflipped PRS
    signed_weights = np.where(is_flipped, -linreg.coef_, linreg.coef_)
    s_coef = pd.Series(np.append(signed_weights, linreg.intercept_), index=list(beta_files)+['intercept'])
    mix_weights_file = args.output_prefix+'.mixweights'
    s_coef.to_frame(name='mix_weight').to_csv(mix_weights_file, sep='\t')
    logging.info('Writing mixing weights to %s'%(mix_weights_file))


def _load_mixweights(mixweights_file, beta_files):
    """Read a .mixweights file and return (weights array, intercept).

    The file is expected to have one row per betas file (in the order given
    in --betas) followed by a final 'intercept' row.

    Raises:
        ValueError: if the file layout is unexpected or its rows do not match
            beta_files.
    """
    #the header has one field fewer than the data rows, so pandas uses the
    #first column (the betas file names) as the index
    s_mixweights = pd.read_csv(mixweights_file, sep=r'\s+').squeeze('columns')
    if not isinstance(s_mixweights, pd.Series) or len(s_mixweights) == 0:
        raise ValueError('%s is not a valid mixweights file'%(mixweights_file))

    #compare as lists, so that a different number of files is reported as a mismatch
    if list(s_mixweights.index[:-1]) != list(beta_files):
        raise ValueError('The provided betas file do not match the mix weights file')
    if s_mixweights.index[-1] != 'intercept':
        raise ValueError('the last row of %s must be the intercept'%(mixweights_file))
    return s_mixweights.iloc[:-1].values, s_mixweights.iloc[-1]


def _write_predictions(args, df_prs_all, beta_files):
    """Write the (combined) PRS to <output_prefix>.prs and its jackknife
    replicates, if there are at least two, to <output_prefix>.prs_jk.

    With a single betas file the PRS is taken as is; with several files the
    per-file PRS are combined linearly using the weights and intercept found
    in <mixweights_prefix>.mixweights.
    """
    weights, intercept = None, None
    if len(beta_files) > 1:
        weights, intercept = _load_mixweights(args.mixweights_prefix+'.mixweights', beta_files)

    def combine(column):
        #a single betas file: the column must be unique and is used directly
        if weights is None:
            assert (df_prs_all.columns == column).sum() == 1
            return df_prs_all[column]
        #several betas files: the column name repeats once per file
        return df_prs_all[column].dot(weights) + intercept

    #save the PRS to disk
    #NOTE(review): FID is filled with a copy of IID rather than the real family id - confirm this is intended
    df_prs_sum = combine('SCORESUM').reset_index(drop=False)
    df_prs_sum.columns = ['IID', 'PRS']
    df_prs_sum['FID'] = df_prs_sum['IID']
    df_prs_sum = df_prs_sum[['FID', 'IID', 'PRS']]
    df_prs_sum.to_csv(args.output_prefix+'.prs', sep='\t', index=False, float_format='%0.5f')

    #handle jackknife (unique column names, in their original order)
    jk_columns = list(dict.fromkeys(c for c in df_prs_all.columns if '.jk' in c))
    if len(jk_columns) > 1:
        df_prs_sum_jk = pd.DataFrame({c: combine(c) for c in jk_columns}, index=df_prs_all.index, columns=jk_columns)
        df_prs_sum_jk.reset_index().to_csv(args.output_prefix+'.prs_jk', sep='\t', index=False, float_format='%0.5f')

    logging.info('Saving PRS to %s'%(args.output_prefix+'.prs'))


def compute_prs(args):
    """Compute a PRS for every betas file, then estimate mixing weights and/or
    write predictions, depending on args.estimate_mixweights and args.predict.

    Raises:
        ValueError: if predictions with several betas files are requested but
            the mixweights file is missing, or if the per-file PRS tables do
            not cover the same individuals in the same order.
    """
    beta_files = args.betas.split(',')

    #if we need to perform predictions, make sure the mixweights file is found
    #(checked up front, before the PRS computation)
    if args.predict and len(beta_files) > 1:
        mixweights_file = args.mixweights_prefix+'.mixweights'
        if not os.path.exists(mixweights_file):
            raise ValueError('mixweights file %s not found'%(mixweights_file))

    #when estimating mixing weights only phenotyped individuals are needed;
    #when predicting, honor --keep (--pheno cannot be given together with --predict)
    keep_file = args.pheno if args.estimate_mixweights else args.keep

    #compute a PRS for each beta file
    df_prs_list = [computs_prs_all_files(args, betas_file, disable_jackknife=not args.predict, keep_file=keep_file)
                   for betas_file in beta_files]
    for betas_file, df_prs in zip(beta_files[1:], df_prs_list[1:]):
        if not df_prs.index.equals(df_prs_list[0].index):
            raise ValueError('the PRS of %s and of %s are not computed for the same individuals'%(betas_file, beta_files[0]))
    df_prs_all = pd.concat(df_prs_list, axis=1)

    #compute mixing weights if needed
    if args.estimate_mixweights:
        _estimate_mixweights(args, df_prs_all, beta_files)

    #perform predictions
    if args.predict:
        _write_predictions(args, df_prs_all, beta_files)
362
429
363
430
364
431
def check_args (args ):
365
- if int (args .predict ) + int (args .combine_betas ) != 1 :
366
- raise ValueError ('you must specify either --predict or --combine-betas (but not both)' )
432
+ if int (args .predict ) + int (args .estimate_mixweights ) != 1 :
433
+ raise ValueError ('you must specify either --predict or --estimate-mixweights (but not both)' )
367
434
if args .plink_exe is None and args .plink2_exe is None :
368
435
raise ValueError ('you must specify either --plink-exe or --plink2-exe' )
369
436
if args .plink_exe is not None and not os .path .exists (args .plink_exe ):
370
437
raise ValueError ('%s not found' % (args .plink_exe ))
371
438
if args .plink2_exe is not None and not os .path .exists (args .plink2_exe ):
372
439
raise ValueError ('%s not found' % (args .plink2_exe ))
373
- if args .combine_betas :
440
+ if args .estimate_mixweights :
374
441
if args .keep is not None :
375
- raise ValueError ('you cannot provide both --combine-betas and --keep' )
442
+ raise ValueError ('you cannot provide both --estimate-mixweights and --keep' )
376
443
if args .pheno is None :
377
- raise ValueError ('you must provide --pheno if you specify --combine-betas ' )
444
+ raise ValueError ('you must provide --pheno if you specify --estimate-mixweights ' )
378
445
if args .betas .count (',' )== 0 :
379
- raise ValueError ('you must provide multiple files in --betas if you specify --combine-betas' )
446
+ raise ValueError ('you must provide multiple files in --betas if you specify --estimate-mixweights' )
447
+ if args .predict :
448
+ if args .mixweights_prefix is None and args .betas .count (',' ) > 0 :
449
+ raise ValueError ('you must provide --mixweights-prefix together with --predict if you have more than one beta file' )
380
450
if args .num_jk < 0 :
381
451
raise ValueError ('--num-jk must be >=0' )
382
452
if args .pheno is not None and args .predict :
383
- raise ValueError ('--pheno can only be used with --combine-betas ' )
453
+ raise ValueError ('--pheno can only be used with --estimate-mixweights ' )
384
454
385
455
if len (list (args .files )) == 0 :
386
456
raise ValueError ('no input files specified' )
@@ -391,9 +461,10 @@ def check_args(args):
391
461
parser = argparse .ArgumentParser ()
392
462
393
463
parser .add_argument ('--betas' , required = True , help = 'files with SNP effect sizes (comma separated). A1 is the effect allele.' )
464
+ parser .add_argument ('--mixweights-prefix' , help = 'Prefix of files with mixing weights (required if you use --predict with more than one betas file' )
394
465
parser .add_argument ('--output-prefix' , required = True , help = 'Prefix of output file' )
395
466
396
- parser .add_argument ('--combine-betas ' , default = False , action = 'store_true' , help = 'If specified, PolyPred will estimate mixing weights' )
467
+ parser .add_argument ('--estimate-mixweights ' , default = False , action = 'store_true' , help = 'If specified, PolyPred will estimate mixing weights' )
397
468
parser .add_argument ('--allow-neg-mixweights' , default = False , action = 'store_true' , help = 'If specified, PolyPred will not enforce non-negative mixing weights' )
398
469
parser .add_argument ('--predict' , default = False , action = 'store_true' , help = 'If specified, PolyPred will compute PRS' )
399
470
parser .add_argument ('--pheno' , default = None , help = 'Phenotype file (required for estimating mixing weights)' )
@@ -403,6 +474,7 @@ def check_args(args):
403
474
parser .add_argument ('--extract' , default = None , help = 'A text file with rsids of SNPs to use (one per line)' )
404
475
parser .add_argument ('--keep' , default = None , help = 'A text file with ids of individuals to use (two columns per line, each containing FID,IID)' )
405
476
parser .add_argument ('--num-jk' , type = int , default = 200 , help = 'number of genomic jackknife blocks' )
477
+ parser .add_argument ('--center' , default = False , action = 'store_true' , help = 'If specified, the PRS will be centered' )
406
478
407
479
parser .add_argument ('--memory' , type = int , default = 2 , help = 'Maximum memory usage (in GB)' )
408
480
parser .add_argument ('--threads' , type = int , default = 1 , help = 'Number of CPU threads' )
@@ -420,21 +492,18 @@ def check_args(args):
420
492
421
493
#check that the output directory exists
422
494
if len (os .path .dirname (args .output_prefix ))> 0 and not os .path .exists (os .path .dirname (args .output_prefix )):
423
- raise ValueError ('output directory %s doesn\' t exist' % (os .path .dirname (args .output_prefix )))
495
+ raise ValueError ('output directory %s doesn\' t exist' % (os .path .dirname (args .output_prefix )))
496
+
497
+
424
498
425
499
#configure logger
426
500
configure_logger (args .output_prefix )
427
501
428
502
#check arguments
429
503
check_args (args )
430
504
431
- #estimate mixing weights if needed
432
- if args .combine_betas :
433
- estimate_mixing_weights (args )
434
-
435
- #compute PRS if needed
436
- if args .predict :
437
- compute_prs (args )
505
+ #Estimate mixing weights and/or compute PRS
506
+ compute_prs (args )
438
507
439
508
print ()
440
509
0 commit comments