Commit 357add2 (parent: 7f1493d)

update polypred to take a weighted sum of PRS instead of a weighted sum of betas, which loses accuracy for reasons we don't yet understand, apparently involving plink
1 file changed: polypred.py (+172, -103)
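The change is conservative in exact arithmetic: with genotypes X, per-file betas b_i, and mixing weights w_i, X @ (w1*b1 + w2*b2) equals w1*(X @ b1) + w2*(X @ b2), so a weighted sum of PRS is algebraically equivalent to scoring a weighted sum of betas. A minimal sketch (synthetic data, not part of the commit) verifying that identity:

    import numpy as np

    # Toy check: combining betas and combining PRS agree in exact arithmetic
    rng = np.random.default_rng(0)
    X = rng.integers(0, 3, size=(5, 4)).astype(float)  # 5 individuals x 4 SNPs, 0/1/2 allele counts
    b1, b2 = rng.normal(size=4), rng.normal(size=4)    # effect sizes from two betas files
    w1, w2 = 0.7, 0.3                                  # mixing weights

    prs_from_betas = X @ (w1*b1 + w2*b2)          # old approach: combine betas, score once
    prs_from_prs = w1*(X @ b1) + w2*(X @ b2)      # new approach: score per file, combine PRS
    assert np.allclose(prs_from_betas, prs_from_prs)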
@@ -105,6 +105,8 @@ def compute_prs_for_file(args,
         plink_cmd += ' --bfile %s --score %s sum'%(plink_file_prefix, betas_file)
     else:
         raise ValueError('neither --bed nor --pgen specified')
+    if args.center:
+        plink_cmd += ' center'
     if ranges_file is not None:
         scores_file = os.path.join(temp_dir, next(tempfile._get_candidate_names()))
         df_betas[['SNP_bim', 'score']].drop_duplicates('SNP_bim').to_csv(scores_file, sep='\t', header=False, index=False)
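The two added lines hook the new --center flag (defined in the argument parser below) into the plink command. A minimal sketch of the command string this branch assembles, assuming plink 1.9's --score modifiers, where per the plink docs 'center' mean-centers dosages before scoring (paths are placeholders):

    plink_cmd = 'plink --bfile chr1 --score chr1.betas sum'
    center = True  # stands in for args.center
    if center:
        plink_cmd += ' center'
    print(plink_cmd)  # plink --bfile chr1 --score chr1.betas sum center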
@@ -175,6 +177,7 @@ def load_betas_files(betas_file, verbose=True):
 
     #rename columns if needed
     df_betas.rename(columns={'sid':'SNP', 'nt1':'A1', 'nt2':'A2', 'BETA_MEAN':'BETA', 'ldpred_inf_beta':'BETA', 'chrom':'CHR', 'Chrom':'CHR', 'pos':'BP'}, inplace=True, errors='ignore')
+
     if not is_numeric_dtype(df_betas['CHR']):
         if df_betas['CHR'].str.startswith('chrom_').all():
             df_betas['CHR'] = df_betas['CHR'].str[6:].astype(np.int64)
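For context, a self-contained illustration (synthetic row, not from the repository) of how the rename map with errors='ignore' harmonizes the supported betas-file formats, including the 'chrom_' prefix handling in the hunk above:

    import pandas as pd

    df = pd.DataFrame({'sid': ['rs1'], 'nt1': ['A'], 'nt2': ['G'],
                       'ldpred_inf_beta': [0.01], 'chrom': ['chrom_1'], 'pos': [12345]})
    df.rename(columns={'sid':'SNP', 'nt1':'A1', 'nt2':'A2', 'BETA_MEAN':'BETA',
                       'ldpred_inf_beta':'BETA', 'chrom':'CHR', 'pos':'BP'},
              inplace=True, errors='ignore')  # absent keys (e.g. BETA_MEAN) are skipped
    df['CHR'] = df['CHR'].str[6:].astype(int)  # 'chrom_1' -> 1
    print(df.columns.tolist())  # ['SNP', 'A1', 'A2', 'BETA', 'CHR', 'BP']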
@@ -263,124 +266,191 @@ def computs_prs_all_files(args, betas_file, disable_jackknife=False, keep_file=N
 
 
 
-def estimate_mixing_weights(args):
+def compute_prs(args):
+
+    #if we need to perform predictions, make sure the mixweights file is found
+    if args.predict and args.betas.count(',') > 0:
+        mixweights_file = args.mixweights_prefix +'.mixweights'
+        if not os.path.exists(mixweights_file):
+            raise ValueError('mixweights file %s not found'%(mixweights_file))
 
-    #read phenotypes
-    df_pheno = pd.read_csv(args.pheno, names=['FID', 'IID', 'PHENO'], index_col='IID', delim_whitespace=True)
-
-    #make sure that we didn't include a header line
-    try:
-        float(df_pheno['PHENO'].iloc[0])
-    except:
-        df_pheno = df_pheno.iloc[1:]
-        df_pheno['PHENO'] = df_pheno['PHENO'].astype(np.float64)
-    if np.any(df_pheno.index.duplicated()):
-        raise ValueError('duplicate ids found in %s'%(args.pheno))
 
     #compute a PRS for each beta file
     beta_files = args.betas.split(',')
-    df_prs_sum_list = []
+    df_prs_list = []
     for betas_file in beta_files:
-        df_prs_sum = computs_prs_all_files(args, betas_file, disable_jackknife=True, keep_file=args.pheno)
-        df_prs_sum_list.append(df_prs_sum[['SCORESUM']])
-    for df_prs_sum in df_prs_sum_list:
-        assert np.all(df_prs_sum.index == df_prs_sum_list[0].index)
-    df_prs_sum_all = pd.concat(df_prs_sum_list, axis=1)
-
-    #sync df_pheno and df_prs_sum_all
-    df_prs_sum_all.index = df_prs_sum_all.index.astype(str)
-    df_pheno.index = df_pheno.index.astype(str)
-    index_shared = df_prs_sum_all.index.intersection(df_pheno.index)
-    assert len(index_shared)>0
-    if len(index_shared) < df_prs_sum_all.shape[0]:
-        df_prs_sum_all = df_prs_sum_all.loc[index_shared]
-    if df_pheno.shape[0] != df_prs_sum_all.shape[0] or np.any(df_prs_sum_all.index != df_pheno.index):
-        df_pheno = df_pheno.loc[df_prs_sum_all.index]
+        df_prs = computs_prs_all_files(args, betas_file, disable_jackknife=not args.predict, keep_file=args.pheno)
+        df_prs_list.append(df_prs)
+    for df_prs in df_prs_list:
+        assert np.all(df_prs.index == df_prs_list[0].index)
+    df_prs_all = pd.concat(df_prs_list, axis=1)
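A toy illustration (synthetic frames, not part of the commit) of what df_prs_all looks like after this concat: one SCORESUM column per betas file, row-aligned on individual id, plus jackknife columns when --predict enables them:

    import pandas as pd

    prs_a = pd.DataFrame({'SCORESUM': [1.2, 0.4]}, index=['iid1', 'iid2'])
    prs_b = pd.DataFrame({'SCORESUM': [0.9, 1.1]}, index=['iid1', 'iid2'])
    df_prs_all = pd.concat([prs_a, prs_b], axis=1)
    print(df_prs_all)
    #       SCORESUM  SCORESUM
    # iid1       1.2       0.9
    # iid2       0.4       1.1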
+
+
+    #compute mixing weights if needed
+    if args.estimate_mixweights:
+
+        #read phenotypes
+        df_pheno = pd.read_csv(args.pheno, names=['FID', 'IID', 'PHENO'], index_col='IID', delim_whitespace=True)
+
+        #make sure that we didn't include a header line
+        try:
+            float(df_pheno['PHENO'].iloc[0])
+        except:
+            df_pheno = df_pheno.iloc[1:]
+            df_pheno['PHENO'] = df_pheno['PHENO'].astype(np.float64)
+        if np.any(df_pheno.index.duplicated()):
+            raise ValueError('duplicate ids found in %s'%(args.pheno))
+
+        #sync df_pheno and df_prs_all
+        df_prs_all.index = df_prs_all.index.astype(str)
+        df_pheno.index = df_pheno.index.astype(str)
+        index_shared = df_prs_all.index.intersection(df_pheno.index)
+        assert len(index_shared)>0
+        if len(index_shared) < df_prs_all.shape[0]:
+            df_prs_all = df_prs_all.loc[index_shared]
+        if df_pheno.shape[0] != df_prs_all.shape[0] or np.any(df_prs_all.index != df_pheno.index):
+            df_pheno = df_pheno.loc[df_prs_all.index]
+
+        #extract just the SCORESUM columns
+        df_prs_sum_all = df_prs_all['SCORESUM'].copy()
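A pandas subtlety the new code relies on, shown with a toy frame: because every per-file block uses the same SCORESUM label, selecting df_prs_all['SCORESUM'] returns all matching columns, i.e. an individuals-by-betas-files DataFrame rather than a single Series:

    import pandas as pd

    df_prs_all = pd.DataFrame([[1.2, 0.9], [0.4, 1.1]],
                              index=['iid1', 'iid2'],
                              columns=['SCORESUM', 'SCORESUM'])  # duplicate labels
    df_prs_sum_all = df_prs_all['SCORESUM'].copy()
    print(df_prs_sum_all.shape)  # (2, 2): one column per betas file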
 
-    #flip PRS that are negatively correlated with the phenotype
-    is_flipped = np.zeros(df_prs_sum_all.shape[1], dtype=bool)
-    linreg_univariate = LinearRegression()
-    for c_i in range(df_prs_sum_all.shape[1]):
-        linreg_univariate.fit(df_prs_sum_all.iloc[:, [c_i]], df_pheno['PHENO'])
-        is_flipped[c_i] = linreg_univariate.coef_[0] < 0
-    df_prs_sum_all.loc[:, is_flipped] *= -1
-
-    #compute mixing weights
-    linreg = LinearRegression(positive = not args.allow_neg_mixweights)
-    linreg.fit(df_prs_sum_all, df_pheno['PHENO'])
-    mix_weights, intercept = linreg.coef_, linreg.intercept_
-    r2_score = metrics.r2_score(df_pheno['PHENO'], linreg.predict(df_prs_sum_all))
-    logging.info('In-sample R2: %0.3f'%(r2_score))
-
-    #create and print df_coef, and save it to disk
-    df_coef = pd.Series(mix_weights, index=beta_files)
-    df_coef.loc['intercept'] = intercept
-    mix_weights_file = args.output_prefix+'.mixweights'
-    df_coef.to_frame(name='mix_weight').to_csv(mix_weights_file, sep='\t')
-    logging.info('Writing mixing weights to %s'%(mix_weights_file))
-
-    #compute weighted betas
-    df_betas_weighted = None
-    for is_flipped_beta, betas_file, mix_weight in zip(is_flipped, beta_files, mix_weights):
-        df_betas = load_betas_files(betas_file)
-        df_betas = df_betas[['SNP', 'CHR', 'BP', 'A1', 'A2', 'BETA']]
-        df_betas['BETA'] *= mix_weight
-        if is_flipped_beta: df_betas['BETA'] = -df_betas['BETA']
-        if df_betas_weighted is None:
-            df_betas_weighted = df_betas
-            continue
+        #flip PRS that are negatively correlated with the phenotype
+        is_flipped = np.zeros(df_prs_sum_all.shape[1], dtype=bool)
+        linreg_univariate = LinearRegression()
+        for c_i in range(df_prs_sum_all.shape[1]):
+            linreg_univariate.fit(df_prs_sum_all.iloc[:, [c_i]], df_pheno['PHENO'])
+            is_flipped[c_i] = linreg_univariate.coef_[0] < 0
+        df_prs_sum_all.loc[:, is_flipped] *= -1
+
+        #estimate mixing weights
+        linreg = LinearRegression(positive = not args.allow_neg_mixweights)
+        linreg.fit(df_prs_sum_all, df_pheno['PHENO'])
+        mix_weights, intercept = linreg.coef_, linreg.intercept_
+        r2_score = metrics.r2_score(df_pheno['PHENO'], linreg.predict(df_prs_sum_all))
+        logging.info('In-sample R2: %0.3f'%(r2_score))
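A sketch of this fit on synthetic data: with the default settings (no --allow-neg-mixweights), positive=True makes scikit-learn solve a non-negative least-squares problem (available in scikit-learn 0.24+), so each flipped-to-positive PRS receives a non-negative weight:

    import numpy as np
    from sklearn.linear_model import LinearRegression

    rng = np.random.default_rng(1)
    prs = rng.normal(size=(1000, 2))                  # two candidate PRS columns
    pheno = 0.6*prs[:, 0] + 0.2*prs[:, 1] + rng.normal(scale=0.5, size=1000)
    linreg = LinearRegression(positive=True)          # non-negative mixing weights
    linreg.fit(prs, pheno)
    print(linreg.coef_, linreg.intercept_)            # roughly [0.6, 0.2], ~0.0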
 
-        index_shared = df_betas.index.intersection(df_betas_weighted.index)
-        df_betas['BETA2'] = df_betas['BETA']
-        df_new = df_betas_weighted.loc[index_shared].merge(df_betas.loc[index_shared, ['BETA2']], left_index=True, right_index=True)
-        df_new['BETA'] += df_new['BETA2']
-        del df_new['BETA2']
-        del df_betas['BETA2']
-        df_list = [df_new, df_betas.loc[~df_betas.index.isin(index_shared)], df_betas_weighted.loc[~df_betas_weighted.index.isin(index_shared)]]
-        df_betas_weighted = pd.concat(df_list, axis=0)
-        df_betas_weighted.sort_values(['CHR', 'BP', 'A1'], inplace=True)
-
-    #save output to file
-    df_betas_weighted[['SNP', 'CHR', 'BP', 'A1', 'A2', 'BETA']].to_csv(args.output_prefix+'.betas', sep='\t', index=False, float_format='%0.6e')
-    logging.info('Saving weighted betas to %s'%(args.output_prefix+'.betas'))
+        #create and print df_coef, and save it to disk
+        df_coef = pd.Series(mix_weights, index=beta_files)
+        df_coef.loc[is_flipped] *= -1
+        df_coef.loc['intercept'] = intercept
+        mix_weights_file = args.output_prefix+'.mixweights'
+        df_coef.to_frame(name='mix_weight').to_csv(mix_weights_file, sep='\t')
+        logging.info('Writing mixing weights to %s'%(mix_weights_file))
+
+        #flip the PRS back
+        df_prs_sum_all.loc[:, is_flipped] *= -1
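The .mixweights file written here is exactly what the --predict branch below reads back: a tab-separated table whose index holds the betas-file paths plus a final 'intercept' row. A self-contained round-trip sketch (using the newer pandas .squeeze('columns') in place of the commit's older squeeze=True read argument):

    import io
    import pandas as pd

    df_coef = pd.Series([0.6, 0.2], index=['file1.betas', 'file2.betas'])
    df_coef.loc['intercept'] = 0.05
    buf = io.StringIO()
    df_coef.to_frame(name='mix_weight').to_csv(buf, sep='\t')

    buf.seek(0)
    s_mixweights = pd.read_csv(buf, sep='\t', index_col=0).squeeze('columns')
    assert s_mixweights.index[-1] == 'intercept'
    assert list(s_mixweights.iloc[:-1]) == [0.6, 0.2]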
 
-
-def compute_prs(args):
+    #perform predictions
+    if args.predict:
+
+        #extract just the SCORESUM columns
+        df_prs_sum_all = df_prs_all['SCORESUM']
+
+        #just take the PRS if there's only a single beta
+        if args.betas.count(',') == 0:
+            assert (df_prs_all.columns=='SCORESUM').sum() == 1
+            s_combined_prs = df_prs_sum_all
+
+        #if there's more than one beta, take the linear combination
+        else:
+            mixweights_file = args.mixweights_prefix +'.mixweights'
+            s_mixweights = pd.read_csv(mixweights_file, delim_whitespace=True, squeeze=True)
+            if np.any(s_mixweights.index[:-1] != args.betas.split(',')):
+                raise ValueError('The provided betas files do not match the mix weights file')
+            assert s_mixweights.index[-1] == 'intercept'
+            s_combined_prs = df_prs_sum_all.dot(s_mixweights.iloc[:-1].values) + s_mixweights.loc['intercept']
+
+        #save the PRS to disk
+        df_prs_sum = s_combined_prs.reset_index(drop=False)
+        df_prs_sum.columns = ['IID', 'PRS']
+        df_prs_sum['FID'] = df_prs_sum['IID']
+        df_prs_sum = df_prs_sum[['FID', 'IID', 'PRS']]
+        df_prs_sum.to_csv(args.output_prefix+'.prs', sep='\t', index=False, float_format='%0.5f')
+
+        #handle jackknife
+        set_jk_columns = set([c for c in df_prs_all.columns if '.jk' in c])
+        df_prs_sum_jk = pd.DataFrame(index=df_prs_all.index, columns=set_jk_columns)
+        if df_prs_sum_jk.shape[1] > 1:
+            for jk_column in set_jk_columns:
+                if args.betas.count(',') == 0:
+                    assert (df_prs_all.columns==jk_column).sum() == 1
+                    df_prs_sum_jk[jk_column] = df_prs_all[jk_column]
+                else:
+                    #import ipdb; ipdb.set_trace()
+                    df_prs_sum_jk[jk_column] = df_prs_all[jk_column].dot(s_mixweights.iloc[:-1].values) + s_mixweights.loc['intercept']
+
+            df_prs_sum_jk.reset_index().to_csv(args.output_prefix+'.prs_jk', sep='\t', index=False, float_format='%0.5f')
+
+        logging.info('Saving PRS to %s'%(args.output_prefix+'.prs'))
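The .prs_jk file holds one combined-PRS column per leave-one-block-out genomic jackknife replicate. As a purely hypothetical downstream use (not part of this commit), a per-individual jackknife standard error could be derived from those columns roughly like so:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(2)
    df_jk = pd.DataFrame(rng.normal(size=(3, 200)))  # 3 individuals x 200 jackknife PRS columns
    B = df_jk.shape[1]
    jk_mean = df_jk.mean(axis=1)
    jk_se = np.sqrt((B - 1) / B * ((df_jk.sub(jk_mean, axis=0))**2).sum(axis=1))
    print(jk_se.round(3))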
+
+
+
+
+    # #compute weighted betas
+    # df_betas_weighted = None
+    # for is_flipped_beta, betas_file, mix_weight in zip(is_flipped, beta_files, mix_weights):
+    #     df_betas = load_betas_files(betas_file)
+    #     df_betas = df_betas[['SNP', 'CHR', 'BP', 'A1', 'A2', 'BETA']]
+    #     df_betas['BETA'] *= mix_weight
+    #     if is_flipped_beta: df_betas['BETA'] = -df_betas['BETA']
+    #     if df_betas_weighted is None:
+    #         df_betas_weighted = df_betas
+    #         continue
+
+    #     index_shared = df_betas.index.intersection(df_betas_weighted.index)
+    #     df_betas['BETA2'] = df_betas['BETA']
+    #     df_new = df_betas_weighted.loc[index_shared].merge(df_betas.loc[index_shared, ['BETA2']], left_index=True, right_index=True)
+    #     df_new['BETA'] += df_new['BETA2']
+    #     del df_new['BETA2']
+    #     del df_betas['BETA2']
+    #     df_list = [df_new, df_betas.loc[~df_betas.index.isin(index_shared)], df_betas_weighted.loc[~df_betas_weighted.index.isin(index_shared)]]
+    #     df_betas_weighted = pd.concat(df_list, axis=0)
+    #     df_betas_weighted.sort_values(['CHR', 'BP', 'A1'], inplace=True)
+
+    # #save weighted betas to file
+    # df_betas_weighted[['SNP', 'CHR', 'BP', 'A1', 'A2', 'BETA']].to_csv(args.output_prefix+'.betas', sep='\t', index=False, float_format='%0.6e')
+    # logging.info('Saving weighted betas to %s'%(args.output_prefix+'.betas'))
 
-    if args.betas.count(',') > 0:
-        raise ValueError('--predict can only be used with a single betas file')
-    df_prs_sum = computs_prs_all_files(args, args.betas, disable_jackknife=False, keep_file=args.keep)
-    df_prs_sum.reset_index(inplace=True, drop=False)
-    df_prs_sum.columns = df_prs_sum.columns.str.replace('SCORESUM', 'PRS')
-    df_prs_sum_main = df_prs_sum[['FID', 'IID', 'PRS']]
-    df_prs_sum_jk = df_prs_sum[['FID', 'IID'] + [c for c in df_prs_sum.columns if c.startswith('PRS.')]]
-
-    df_prs_sum_main.to_csv(args.output_prefix+'.prs', sep='\t', index=False, float_format='%0.5f')
-    if df_prs_sum_jk.shape[1]>1:
-        df_prs_sum_jk.to_csv(args.output_prefix+'.prs_jk', sep='\t', index=False, float_format='%0.5f')
-    logging.info('Saving PRS to %s'%(args.output_prefix+'.prs'))
+
+# def compute_prs(args):
+
+#     if args.betas.count(',') > 0:
+#         raise ValueError('--predict can only be used with a single betas file')
+#     df_prs_sum = computs_prs_all_files(args, args.betas, disable_jackknife=False, keep_file=args.keep)
+#     df_prs_sum.reset_index(inplace=True, drop=False)
+#     df_prs_sum.columns = df_prs_sum.columns.str.replace('SCORESUM', 'PRS')
+#     df_prs_sum_main = df_prs_sum[['FID', 'IID', 'PRS']]
+#     df_prs_sum_jk = df_prs_sum[['FID', 'IID'] + [c for c in df_prs_sum.columns if c.startswith('PRS.')]]

+#     df_prs_sum_main.to_csv(args.output_prefix+'.prs', sep='\t', index=False, float_format='%0.5f')
+#     if df_prs_sum_jk.shape[1]>1:
+#         df_prs_sum_jk.to_csv(args.output_prefix+'.prs_jk', sep='\t', index=False, float_format='%0.5f')
+#     logging.info('Saving PRS to %s'%(args.output_prefix+'.prs'))
 
 
 def check_args(args):
-    if int(args.predict) + int(args.combine_betas) != 1:
-        raise ValueError('you must specify either --predict or --combine-betas (but not both)')
+    if int(args.predict) + int(args.estimate_mixweights) != 1:
+        raise ValueError('you must specify either --predict or --estimate-mixweights (but not both)')
     if args.plink_exe is None and args.plink2_exe is None:
         raise ValueError('you must specify either --plink-exe or --plink2-exe')
     if args.plink_exe is not None and not os.path.exists(args.plink_exe):
         raise ValueError('%s not found'%(args.plink_exe))
     if args.plink2_exe is not None and not os.path.exists(args.plink2_exe):
         raise ValueError('%s not found'%(args.plink2_exe))
-    if args.combine_betas:
+    if args.estimate_mixweights:
         if args.keep is not None:
-            raise ValueError('you cannot provide both --combine-betas and --keep')
+            raise ValueError('you cannot provide both --estimate-mixweights and --keep')
         if args.pheno is None:
-            raise ValueError('you must provide --pheno if you specify --combine-betas')
+            raise ValueError('you must provide --pheno if you specify --estimate-mixweights')
         if args.betas.count(',')==0:
-            raise ValueError('you must provide multiple files in --betas if you specify --combine-betas')
+            raise ValueError('you must provide multiple files in --betas if you specify --estimate-mixweights')
+    if args.predict:
+        if args.mixweights_prefix is None and args.betas.count(',') > 0:
+            raise ValueError('you must provide --mixweights-prefix together with --predict if you have more than one beta file')
     if args.num_jk<0:
         raise ValueError('--num-jk must be >=0')
     if args.pheno is not None and args.predict:
-        raise ValueError('--pheno can only be used with --combine-betas')
+        raise ValueError('--pheno can only be used with --estimate-mixweights')
 
     if len(list(args.files)) == 0:
         raise ValueError('no input files specified')
@@ -391,9 +461,10 @@ def check_args(args):
     parser = argparse.ArgumentParser()
 
     parser.add_argument('--betas', required=True, help='files with SNP effect sizes (comma separated). A1 is the effect allele.')
+    parser.add_argument('--mixweights-prefix', help='Prefix of files with mixing weights (required if you use --predict with more than one betas file)')
     parser.add_argument('--output-prefix', required=True, help='Prefix of output file')
 
-    parser.add_argument('--combine-betas', default=False, action='store_true', help='If specified, PolyPred will estimate mixing weights')
+    parser.add_argument('--estimate-mixweights', default=False, action='store_true', help='If specified, PolyPred will estimate mixing weights')
     parser.add_argument('--allow-neg-mixweights', default=False, action='store_true', help='If specified, PolyPred will not enforce non-negative mixing weights')
     parser.add_argument('--predict', default=False, action='store_true', help='If specified, PolyPred will compute PRS')
     parser.add_argument('--pheno', default=None, help='Phenotype file (required for estimating mixing weights)')
@@ -403,6 +474,7 @@ def check_args(args):
     parser.add_argument('--extract', default=None, help='A text file with rsids of SNPs to use (one per line)')
     parser.add_argument('--keep', default=None, help='A text file with ids of individuals to use (two columns per line, each containing FID,IID)')
     parser.add_argument('--num-jk', type=int, default=200, help='number of genomic jackknife blocks')
+    parser.add_argument('--center', default=False, action='store_true', help='If specified, the PRS will be centered')
 
     parser.add_argument('--memory', type=int, default=2, help='Maximum memory usage (in GB)')
     parser.add_argument('--threads', type=int, default=1, help='Number of CPU threads')
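Taken together with the check_args changes above, the new flags imply a two-step workflow along these lines (all file names are placeholders; the trailing positional arguments are the genotype files consumed via args.files):

    # step 1: estimate mixing weights on a training cohort with phenotypes
    python polypred.py --estimate-mixweights --betas file1.betas,file2.betas \
        --pheno train.pheno --output-prefix mix --plink-exe ./plink train_chr*.bed

    # step 2: compute PRS in a target cohort, reusing the step-1 weights
    python polypred.py --predict --betas file1.betas,file2.betas \
        --mixweights-prefix mix --output-prefix target --plink-exe ./plink target_chr*.bed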
@@ -420,21 +492,18 @@ def check_args(args):
 
     #check that the output directory exists
     if len(os.path.dirname(args.output_prefix))>0 and not os.path.exists(os.path.dirname(args.output_prefix)):
-        raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.output_prefix)))
+        raise ValueError('output directory %s doesn\'t exist'%(os.path.dirname(args.output_prefix)))
+
+
 
     #configure logger
     configure_logger(args.output_prefix)
 
     #check arguments
     check_args(args)
 
-    #estimate mixing weights if needed
-    if args.combine_betas:
-        estimate_mixing_weights(args)
-
-    #compute PRS if needed
-    if args.predict:
-        compute_prs(args)
+    #estimate mixing weights and/or compute PRS
+    compute_prs(args)
 
     print()