#This code separates the IQ 4yrs data and related PRS scores for the different individuals into a train, test and validate set
#as a tool to obtain the most appropriate SAIL model.This will be used as a "real" data example for Sahir's manuscript to illustrate
#the uses of the SAIL algorithm. We also remove any outliers in order to improve legibility of the graphs.
#This is the code used to generate the figures for the manuscript
# load packages ---------------------------------------------------------
# rm(list=ls())
pacman::p_load(simulator) # this file was created under simulator version 0.2.1
# devtools::load_all()
pacman::p_load(data.table)
pacman::p_load(magrittr)
# pacman::p_load(genefilter)
pacman::p_load(tidyverse)
pacman::p_load(doParallel)
pacman::p_load(splines)
pacman::p_load(foreach)
pacman::p_load(doMC)
pacman::p_load(glmnet)
pacman::p_load_current_gh('sahirbhatnagar/glinternet')
pacman::p_load_gh('sahirbhatnagar/sail')
pacman::p_load(LassoBacktracking)
pacman::p_load(SAM)
pacman::p_load(gamsel)
pacman::p_load_gh('asadharis/HierBasis')
pacman::p_load(psych) # for error.crosses plot
pacman::p_load(ggplot2)
# pacman::p_load(sortinghat)
# pacman::p_load(sail)
# pacman::p_load(ggplot2)
# load source code helper files -------------------------------------------
#Splits data evenly between train and validate (train=60,valid=60,test=50)
#source("/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/DNA methylation/Sahir/Script/05_model_functions.R")
#Uneven split of train and validate datasets (train=110,valid=40,test=20)
# source("/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/DNA methylation/Sahir/Script/05_model_functions_2.R")
# source("/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/DNA methylation/Sahir/Script/05_method_functions.R")
# source("/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/DNA methylation/Sahir/Script/05_eval_functions.R")
# source("/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/DNA methylation/Sahir/Script/05_plot_functions_rda.R")
source("~/git_repositories/sail/rda/PRS/05_model_functions_2.R")
source("~/git_repositories/sail/rda/PRS/05_method_functions.R")
source("~/git_repositories/sail/rda/PRS/05_eval_functions.R")
source("~/git_repositories/sail/rda/PRS/05_plot_functions_rda.R")
#######################################################################################################################################################
######Load data and format appropriately
gen3pc=read.table("~/git_repositories/sail/rda/PRS/Gen_3PC_scores.txt", header=TRUE)
#Missing one column of info: ### #eigvals: 8.654 1.217 1.198 ### (might be important)
gen3pc=cbind(gen3pc,rownames(gen3pc))
colnames(gen3pc)[4]="SentrixID"
iq_md=read.table("~/git_repositories/sail/rda/PRS/IQ_and_mental_development_variables_for_Sahir_with_study_ID.txt", header=TRUE)
snp_prs_na=read.table("~/git_repositories/sail/rda/PRS/NFP_170614_INFO08_nodup_hard09_noambi_GWAS_EduYears_Pooled_beta_withaf_5000pruned_noambi_16Jan2018.score", header=TRUE,sep=",")
######Merge the iq_md (SentrixID), snp_prs_na (Label.1) and gen3pc (SentrixID) datasets together
m1=merge(iq_md,snp_prs_na,by.x="SentrixID",by.y="Label.1")
IQdat=merge(m1,gen3pc,by.x="SentrixID",by.y="SentrixID")
###Keep relavant columns
IQ4y=
subset(IQdat, select=c("IQ_4yrs", #Y variable
"Tx_group_bin", #E variable
"PRS_0.0001","PRS_0.001","PRS_0.01","PRS_0.05","PRS_0.1","PRS_0.2","PRS_0.3","PRS_0.4","PRS_0.5")) #X variables
pacman::p_load(mice)
pt <- mice::mice(IQ4y, m = 1, seed = 1234)
IQ4y <- mice::complete(pt)
IQ4y <- as.matrix(IQ4y)
rownames(IQ4y)=IQdat[,"SentrixID"]
IQ4y <- IQ4y[!is.na(IQ4y[,"IQ_4yrs"]),] #remove rows with missing IQ values
# these are used for plotting. we use the entire data set for the plots --------------------
DT <- IQ4y
X <- IQ4y %>% as.matrix()
Xnorm <- sail:::standardize(X, center = TRUE, normalize = TRUE)$x
e <- IQ4y[, "Tx_group_bin"]
Y <- DT[,"IQ_4yrs"] %>% as.numeric
#######################################################################################################################################################
######Generate the 200 simulations and test them against sail, lasso, lassobt, hierbasis and glinternetsplit
#-----------------
#4 methods were run here (sail, lasso, lassobt, hierbasis)
#Glinternetsplit did not converge (you can add glinternetsplit as one of the factors in run_method to see for yourself)
#When plots of mse and number of avtive variables were fun for these 4 methods, we noticed that there were some outliers.
#These outliers made the plots more difficult to interpret. As such, the next step was to identify which of the 210 simulation
#resulted in these results and remove them from the analysis
#sim <- new_simulation(name = "IQ_4yrs_cond2_split2_4",
# label = "IQ_4yrs_cond2_split2_4",
# dir = ".") %>%
# generate_model(make_data_split, seed = 12254,
# phenoVariable = "IQ_4yrs",
# exposure = "Tx_group_bin",n_train_test = 150) %>%
# simulate_from_model(nsim = 6, index = 1:35) %>%
# run_method(list(lassosplit, sailsplit, lassoBTsplit, Hiersplit),
# parallel = list(socket_names = 35,#refers to the number of cores on which to split the simulation
# libraries = c("LassoBacktracking", "glinternet","glmnet","splines",
# "magrittr","sail","gamsel","SAM","HierBasis","simulator", "parallel")))
#
#sim <- sim %>% evaluate(list(msevalid, nactive, r2))
#
#save(sim,file="/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/files/IQ_4yrs_cond2_s2_4.Rdata")
##########Here we run the 4 simulations (but wait before running the 4 methods)
#Create the 210 simulations (different splits of train/validate/test)
sim <- new_simulation(name = "IQ_4yrs_cond2_split2_4",
label = "IQ_4yrs_cond2_split2_4",
dir = "my_sims/") %>%
generate_model(make_data_split, seed = 12254,
phenoVariable = "IQ_4yrs",
exposure = "Tx_group_bin",n_train_test = 150) %>%
simulate_from_model(nsim = 6, index = 1:35)
#Now, we run the 4 methods save the results:
sim <- run_method(sim,list(lassosplit, sailsplit,sailsplitadaptive, sailsplitadaptiveweak,sailsplitweak, lassoBTsplit, Hiersplit),
parallel = list(socket_names = 40,#refers to the number of cores on which to split the simulation
libraries = c("LassoBacktracking", "glmnet","splines",
"magrittr","sail","gamsel","SAM","HierBasis","simulator", "parallel"))) %>%
evaluate(list(msevalid, nactive, r2))
save_simulation(sim)
as.data.frame(evals(sim))
ls()
sim <- load_simulation(name = "PRS_IQ_4yrs_cond2_split2_4", dir = "my_sims/")
plot_eval(sim, "mse")
sim %>% subset_simulation(methods = c("lasso", "sail","sailweak",
"Adaptivesail", "Adaptivesailweak",
"lassoBT")) %>% plot_eval("nactive")
sim %>% subset_simulation(methods = c("lasso", "sail","sailweak",
"Adaptivesail", "Adaptivesailweak",
"lassoBT")) %>% plot_eval("mse")
# save(sim,file="/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/files/IQ_4yrs_cond2_s2_4.Rdata")
sim@output_refs
simulator::get_contents(dir = "my_sims/")
simulator::load_outputs(dir = "my_sims/",
model_name = "IQ4y_data_split_Cond2_split2_4/exposure_Tx_group_bin/n_train_test_150/phenoVariable_IQ_4yrs/",
method_name = "sail")
index <- paste0("r",rep(1:35,each=6),".",1:6)
# the 2 is the index for sail
COEFlist <- lapply(index, function(i) {
simulator::output(sim)[[2]]@out[[i]][["beta"]]
})
#######################################################################################################################################################
######Plot the results
# load simulator object for plots -----------------------------------------
# load("/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/files/IQ_4yrs_cond2_s2_4.Rdata")
# df <- as.data.frame(evals(sim))
# saveRDS(df, file = "~/git_repositories/sail/my_sims/rda_results/rda_IQ_4yrs_cond2_s2_4.rds")
# plot results ------------------------------------------------------------
# sim %>% plot_eval(metric_name = "nactive")
# sim %>% plot_eval(metric_name = "mse")
# sim %>% plot_eval(metric_name = "r2")
## ---- load-results
#df <- readRDS("/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/files/IQ_4yrs_cond2_s2_4.Rdata")
df <- readRDS(file = "~/git_repositories/sail/my_sims/rda_results/rda_IQ_4yrs_cond2_s2_4.rds")
DT_res <- df %>% as.data.table()
## ---- get-best-sail-model ---- (the one that results in the lowest MSE)
# get best model
draw_ind <- DT_res[Method=="sail"][which.min(mse)]$Draw
# draw random sample
draw_ind <- sample(DT_res[Method=="sail"]$Draw, size = 1)
dat <- draws(sim)@draws[[draw_ind]]
saveRDS(dat, file = "~/git_repositories/sail/my_sims/rda_results/rda_IQ_4yrs_cond2_s2_4_data.rds")
dat <- readRDS("~/git_repositories/sail/my_sims/rda_results/rda_IQ_4yrs_cond2_s2_4_data.rds")
# devtools::load_all()
# the bootstrap results are based on strong=FALSE, alpha = 0.1, degree=3
fit <- sail(x = dat$xtrain, y = dat$ytrain, e = dat$etrain,
basis = function(i) splines::bs(i, degree = 3),
expand = TRUE,
center.x = T,
center.e = T,
#group = dat$group,
alpha = 0.1,
#maxit = 250,
strong = FALSE,
verbose = 2
)
plot(fit)
fit
ytest_hat <- predict(fit, newx = dat$xtest, newe = dat$etest)
msetest <- colMeans((dat$ytest - ytest_hat)^2)
lambda.min.index <- as.numeric(which.min(msetest))
lambda.min <- fit$lambda[which.min(msetest)]
yvalid_hat <- predict(fit, newx = dat$xvalid, newe = dat$evalid, s = lambda.min)
msevalid <- mean((dat$yvalid - drop(yvalid_hat))^2)
(nzcoef <- predict(fit, s = lambda.min, type = "nonzero"))
design <- do.call(rbind, list(dat$xtrain, dat$xtest, dat$xvalid))
#design2=sail:::design_sail(design,e,expand=TRUE,basis = function(i) splines::bs(i, degree = 3),center.e=TRUE,center.x=TRUE,nvars=ncol(design),vnames=colnames(design))$design
#Get the fit for the entire dataset (no special selection of the best model- just simple application of sail to the dataset)
fit <- sail(x = X[,-c(1,2)], y = Y, e = IQ4y[, "Tx_group_bin"],
basis = function(i) splines::bs(i, degree = 3),
expand = TRUE,
center.x = T,
center.e = T,
#group = dat$group,
alpha = 0.1,
#maxit = 250,
strong = FALSE,
verbose = 1
)
library(doMC)
registerDoMC(cores = 10)
fitcv <- cv.sail(x = X[,-c(1,2)], y = Y, e = IQ4y[, "Tx_group_bin"],
basis = function(i) splines::bs(i, degree = 3),
expand = TRUE,
center.x = T,
center.e = T,
#group = dat$group,
alpha = 0.1,
#maxit = 250,
strong = FALSE,
verbose = 1,
parallel = TRUE
)
plot(fitcv)
coef(fitcv, s= "lambda.min")
## ----error-crosses plots----
affect.mat2 <- describeBy(DT_res[, c("mse","nactive")], DT_res$Method, mat = TRUE)
affect.mat2 <- affect.mat2[which(affect.mat2$group1 %in% c("Adaptivesailweak","lasso","lassoBT","HierBasis")),]
par(family="serif")
error.crosses(affect.mat2[grep("nactive", rownames(affect.mat2)),],
affect.mat2[grep("mse", rownames(affect.mat2)),],
labels=unique(affect.mat2$group1),
xlab="Number of Active Variables",
main = "IQ Data: Means (+/- 1 SE) from 200 Train/Validate/Test Splits",
sd = FALSE,
cex.lab = 1.4,
cex.axis = 1.4,
cex.main = 1.5,
xlim = c(0, 10),
# ylim=c(100,250),
#ylim=c(-1000,1000),
ylab="Test Set MSE",
colors = sail:::cbbPalette[c(3,4,7,2)],
pch=16, cex=2)
#########################
#Plot interactions separately for the dichotomous Environment variable
plotInterPRS(object = fitcv$sail.fit,
originalDataNotCentered = IQ4y, # complete original data which contains both "x" and "e"
xvar = "PRS_0.0001",
s = fitcv$lambda.min,
design = fitcv$sail.fit$design, # this contains the design matrix from sail
evar = "Tx_group_bin", # this is the name of the E column in 'originalDataNotCentered'
xlab = "PRS_0.0001",
degree = 3,
ylab = "Marginal Risk",
legend.position = "bottomright",
main = "Effect of Intervention and PRS
on IQ at 4 years of age",
rug = TRUE,
color = sail:::cbbPalette[c(2,4)],
legend = TRUE)
# not used underneath this line -------------------------------------------
#To extrapolate the entire dataset (not just the training set on which we fitted)
#Uncomment the following lines
#object=fit_t #fit on the entire dataset
#x=X[,-c(1,2)]
#To extrapolate the just the training set on which we fitted
#Uncomment the following lines
object=fit #fit on the training dataset
x=dat$xtrain
object = fitcv$sail.fit
#Define important variables
i=1 #(since we are interested in PRS_0.0001 which is the first of the covariates, i=1)
cv=c("IQ4y")
xv=colnames(get(cv[1]))[-c(1,2)] #Remove the first 2 columns that contain the response and environment variable
xvar = xv[i]
# s = lambda.min
s <- fitcv$lambda.min
xlab=xv[i]
ylab=paste0("f(",xv[i],")")
color=c("orange","blue")
#Extract relevant coefficients
ind <- object$group == which(object$vnames == xvar)
allCoefs <- coef(object, s = s)
a0 <- allCoefs[1, ]
betas <- as.matrix(allCoefs[object$main.effect.names[ind], , drop = FALSE])
alphas <- as.matrix(allCoefs[object$interaction.names[ind], , drop = FALSE])
betaE <- as.matrix(allCoefs["E", , drop = FALSE])
#####Extract the design matrix from the fit object (from the training dataset - 110 individuals, from the entire dataset-170 individuals)
design.mat.main <- object$design[, object$main.effect.names[ind], drop = FALSE]
design.mat.int <- object$design[, object$interaction.names[ind], drop = FALSE]
ind2 <- which(object$vnames %in% xvar)
originalE <- object$design[, "E", drop = FALSE]
originalX <- X[,xvar,drop=FALSE]
#Obtain the associated IQ value for the corresponding PRS score and environment status
f.hat= drop(originalE * as.vector(betaE) +
design.mat.main %*% betas + design.mat.int %*% alphas)
ylims <- range(f.hat)
#Separate data according to the Environement variable(control or intervention)
intervention = max(drop(unique(originalE)))
control = min(drop(unique(originalE)))
cont_index <- which(originalE==control)
int_index <- which(originalE==intervention)
cont_pred <- f.hat[cont_index]
int_pred <- f.hat[int_index]
#Plot the effects
min.length.top <- range(f.hat)[1] ; max.length.top <- range(f.hat)[2]
# par(mai=c(1,1,1,0.2))
plot(originalX, f.hat,
pch = 19,
ylab = ylab,
xlab = xlab,
bty="n",
xaxt="n",
type = "n",
# cex.lab = 2,
# cex.axis = 2,
# cex = 2,
main = "Effect of Intervention and PRS
on IQ at 4 years of age"
# cex.main = 2.5,
# ylim = c(min.length.top-3, max.length.top+3),
# ylim = ylims
)
# axis(1, labels = T, cex.axis = 2)
axis(1, labels = T)
lines(originalX[cont_index][order(originalX[cont_index])], cont_pred[order(originalX[cont_index])], col = color[1], lwd = 3)
lines(originalX[int_index][order(originalX[int_index])], int_pred[order(originalX[int_index])], col = color[2], lwd = 3)
legend("bottomright",c("Intervention","Control"),col=c(color[2],color[1]),lwd=c(3,3))
plotInterADNI(object=fit, x=originalX, xvar = xv[i], s = lambda.min,
design = fit$basis(fit$design[,1,drop=F]), # this contains user defined expand matrix
e = originalE, # this is E vector for whole sample
# xlab , ylab ,
legend.position = "bottomleft", main = "", rug = TRUE,
color = sail:::cbbPalette[c(6,4,7)], legend = TRUE)
#####################$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#####################$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#####################$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#########Extra... looking at main and interaction effects for all variables
############
####Plot Main effects (fit=fit with training dataset giving lowest MSE ; fit_t=fit with the entire dataset)
######At lambda.min
par(mfrow=c(2,2))
cv=c("IQ4y")
xv=colnames(get(cv[1]))[-c(1,2)]
for(j in 1:length(cv)){
pdf(paste0("/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/DNA methylation/Sahir/DATA/IQ and mental development/",cv[j],"_main_c2_2.pdf"))
for(i in 1:length(xv)){
plotMain(fit, x = dat$xtrain, e = dat$etrain_lasso,
xvar = xv[i], s = lambda.min,ylab = paste0("f(",xv[i],")"), xlab = xv[i],legend.position = "bottomleft", main="Train plot")
plotMain(fit, x = dat$xtest, e = dat$etest_lasso,
xvar = xv[i], s = lambda.min,ylab = paste0("f(",xv[i],")"), xlab = xv[i],legend.position = "bottomleft", main="Test plot")
plotMain(fit, x = dat$xvalid, e = dat$evalid,
xvar = xv[i], s = lambda.min,ylab = paste0("f(",xv[i],")"), xlab = xv[i],legend.position = "bottomleft", main="Validate plot")
plotMain(fit_t, x = X[,-c(1,2)], e = e,
xvar = xv[i], s = lambda.min,ylab = paste0("f(",xv[i],")"), xlab = xv[i],legend.position = "bottomleft", main="Full dataset plot")
}
dev.off()
}
############
####Plot Inter effects (fit=fit with training dataset giving lowest MSE ; fit_t=fit with the entire dataset)
######At lambda.min (based on the best sail model)
cv=c("IQ4y")
xv=colnames(get(cv[1]))[-c(1,2)]
for(j in 1:length(cv)){
pdf(paste0("/mnt/GREENWOOD_JBOD1/GREENWOOD_BACKUP/home/amanda.lovato/DNA methylation/Sahir/DATA/IQ and mental development/",cv[j],"_inter_c2_2.pdf"))
for(i in 1:length(xv)){
try( plotInter(object = fit, x = dat$xtrain, e = dat$etrain_lasso,
xvar = xv[i], s = lambda.min,ylab = paste0("f(",xv[i],")"), xlab = xv[i],legend.position = "bottomleft", main="Train plot") )
try( plotInter(object = fit, x = dat$xtest, e = dat$etest_lasso,
xvar = xv[i], s = lambda.min,ylab = paste0("f(",xv[i],")"), xlab = xv[i],legend.position = "bottomleft", main="Test plot") )
try( plotInter(object = fit, x = dat$xvalid, e = dat$evalid,
xvar = xv[i], s = lambda.min,ylab = paste0("f(",xv[i],")"), xlab = xv[i],legend.position = "bottomleft", main="Validate plot") )
try( plotInter(fit_t, x = X[,-c(1,2)], xvar=xv[i], e = e,
s = lambda.min, ylab = paste0("f(",xv[i],")"), xlab = xv[i], legend.position = "bottomleft", main="Full dataset plot") )
}
dev.off()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.