#' @title PXSgl
#' @description this function builds a polyexposure risk score with consideration of pairwise interactions between exposures using the group lasso method
#' @source
#' @author Yixuan He, \email{yixuan_he@@hms.harvard.edu}
#' @param df the data frame input
#' @param X column name of significant exposure variables from XWAS
#' @param cov column name of covariates
#' @param mod type of model to run; 'lm' for linear regression, 'logistic' for logistic regression; 'cox' for Cox regression
#' @param IDA list of IDs to from XWAS procedure
#' @param IDB list of IDs for testing set
#' @param IDC list of IDs in the final prediction set
#' @param seed setting a seed
#' @param removes any exposure response, categorical or numerical, to remove from the analysis This should be in the form of a list
#' @param fdr whether or not to adjust for multiple hypothesis correction
#' @param intermediate whether or not to save intermediate files
#' @param folds number of folds for the cross validation step, default is 10.
#' @export
#'
#' @importFrom magrittr %>%
#' @import naniar
#' @import glmnet
#' @import broom
#' @import ggplot2
#' @import glinternet
#' @import logger
#'
PXSgl = function(df,
X,
cov,
mod,
IDA,
IDB,
IDC,
seed,
removes = NULL,
intermediate = F,
folds=10) {
`%notin%` <- Negate(`%in%`)
set.seed(seed)
#check to make sure each column has more than 1 unique value
log_info(paste('intiating group lasso PXS procedure with', length(X), 'variables'))
df[sapply(df, is.character)] <- lapply(df[sapply(df, is.character)],as.factor) #change all non-numeric columns to factor form
dfA = df[which(df$ID %in% IDA), ]
keep <- data.frame(dfA[, c(which(colnames(dfA) %in% c('PHENO', cov, X)))])
one = which(sapply(keep, function(x)
length(unique(x)) > 1) == FALSE)
if (length(one) != 0) {
log_warn(paste(
colnames(keep)[one],
'has only one unique value, please double check or remove.'
))
stop()
}
if (length(removes) != 0) {
log_info('excluding individuals...')
b=apply(keep, 1, function(r) any(r %in% removes))
if(length(which(b==TRUE))!=0){
keep=keep[-which(b==TRUE),]
}
else{
log_info('no responses to remove')
}
}
keep = na.omit(keep)
log_info(paste(nrow(keep),'individuals remain'))
###GROUP LASSO WITH GLINTERNET
#code adapted from tutorial https://strakaps.github.io/post/glinternet/
Y=keep$PHENO
keep=keep %>% dplyr::select(-PHENO)
i_num <- sapply(keep, is.numeric)
X <- keep
# get the numLevels vector containing the number of categories
numLevels <- X %>% sapply(nlevels)
numLevels[numLevels==0] <- 1
# make the categorical variables take integer values starting from 0
X[, !i_num] <- apply(X[, !i_num], 2, function(col) as.integer(as.factor(col)) - 1)
log_info(paste('cross validation with',folds,'folds'))
if(mod %in% c('cox','logistic')){
cv_fit <- glinternet::glinternet.cv(X,Y, nFolds = folds,numLevels,family='binomial')
}
if(mod %in% c('lm')){
cv_fit <- glinternet::glinternet.cv(X,Y, nFolds = folds,numLevels)
}
lamb <- which.min(cv_fit$cvErr) #picking lambda with min error
log_info(paste('the min lamda is:', cv_fit$lambda[lamb]))
coefs <- coef(cv_fit$glinternetFit)[[lamb]]
##names of main/interaction effects
###names
idx_num <- (1:length(i_num))[i_num]
idx_cat <- (1:length(i_num))[!i_num]
catnames=names(numLevels)[idx_cat[coefs$mainEffects$cat]] #categorical
contnames=names(numLevels)[idx_num[coefs$mainEffects$cont]] #continuous
if(length(catnames)==0){ #if no cat variables, set it to empty list
catnames=c()
}
if(length(contnames)==0){ #if no numeric variables, set it to empty list
contnames=c()
}
if(length(catnames) ==0 & length(contnames)==0){
log_warn ('group lasso did not select any variables, procedure terminated')
stop()
}
mains=coefs$mainEffects #get main interactions only (gives indices of cat and cont variable)
interacts=coefs$interactions #interactions (in pairs [i,j])
##################
## re-calibrate in group B
log_info('recalibrating model in group B...')
keepBC = df[which(df$ID %in% c(IDB,IDC)), c(which(colnames(df) %in% c('ID','PHENO', catnames,contnames,cov)))]
if(mod=='cox'){
keepBC = df[which(df$ID %in% c(IDB,IDC)), c(which(colnames(df) %in% c('ID','PHENO','TIME', catnames,contnames,cov)))]
}
if (length(removes) != 0) {
log_info('excluding individuals...')
b=apply(keepBC, 1, function(r) any(r %in% removes))
if(length(which(b==TRUE))!=0){
keepBC=keepBC[-which(b==TRUE),]
}
else{
log_info('no responses to remove')
}
}
keepBC = na.omit(keepBC)
log_info(paste(nrow(keepBC),'individuals remain'))
one = which(sapply(keepBC, function(x)
length(unique(x)) > 1) == FALSE)
if (length(one) != 0) {
log_warn(paste(
colnames(keepBC)[one], #####may need to only have just one here not this whole thing
'has only one unique value, removed in calibration step.'
))
}
catcat=data.frame(interacts$catcat)
contcont=data.frame(interacts$contcont)
catcont=data.frame(interacts$catcont)
if(length(catcat)!=0){
catcat$one_name=names(numLevels)[idx_cat[catcat[,1]]]
catcat$two_name=names(numLevels)[idx_cat[catcat[,2]]]
}
if(length(contcont)!=0){
contcont$one_name=names(numLevels)[idx_num[contcont[,1]]]
contcont$two_name=names(numLevels)[idx_num[contcont[,2]]]
}
if(length(catcont)!=0){
catcont$one_name=names(numLevels)[idx_cat[catcont[,1]]]
catcont$two_name=names(numLevels)[idx_num[catcont[,2]]]
}
interacts=rbind(catcat,contcont,catcont)
interacts$interact=paste(interacts$one_name,interacts$two_name,sep='*')
if(length(one!=0)){
interacts=interacts[!apply(interacts, 1, function(r) any(r %in% names(one))),] #remove any interactions with variable that has only one response
}
mains=c(catnames,contnames)
if(length(one!=0)){
mains=mains[which(mains%notin%names(one))] #remove any mains with variable that has only one response
}
form_inter=paste(interacts$interact,collapse = '+')
form_main=paste(mains,collapse = '+')
cov=paste(cov,collapse = '+')
if(mod%in%c('lm','logistic')){
f=as.formula(paste('PHENO',paste(cov,form_main,form_inter,sep='+'),sep='~'))
}
if(mod%in%c('cox')){
f=as.formula(paste('survival::Surv(TIME, PHENO)',paste(0,form_main,form_inter,sep='+'),sep='~'))
}
keepB <-keepBC[which(keepBC$ID%in%IDB),]
keepB$ID=NULL
if(mod=='lm'){
model=lm(f,data=keepB)
}
if(mod=='logistic'){
model=glm(f,data=keepB,family='binomial')
}
if(mod=='cox'){
model=survival::coxph(f,data=keepB)
}
dfC <-keepBC[which(keepBC$ID%in%IDC),]
if(mod=='lm'){
dfC$pred=predict(model,dfC)
}
if (mod=='logistic'){
dfC$pred=predict(model,dfC,type='response')
}
if (mod=='cox'){
dfC$pred=predict(model,dfC,type='risk')
}
return(dfC)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.