# R/LEGIT.R In LEGIT: Latent Environmental & Genetic InTeraction (LEGIT) Model

#### Documented in backward_stepbackward_step_IMbest_modelbest_model.elastic_net_var_selectbootstrap_var_selectelastic_net_var_selectexample_2wayexample_3wayexample_3way_3latentexample_with_crossoverforward_stepforward_step_IMgenetic_var_selectGxE_interaction_RoSGxE_interaction_testIMLEGITIMLEGIT_netIMLEGIT_to_LEGITLEGITLEGIT_to_IMLEGITlongitudinal_foldsnes_var_selectplot.elastic_net_var_selectplot.LEGITpredict.IMLEGITpredict.LEGITr1nes_var_selectstepwise_searchstepwise_search_IMsummary.elastic_net_var_selectsummary.IMLEGITsummary.LEGIT

#' @title Simulated example of a 2 way interaction GxE model.
#' @description Simulated example of a 2 way interaction GxE model (where G and E are latent variables).
#' \deqn{g_j \sim Binomial(n=1,p=.30)}
#' \deqn{j = 1, 2, 3, 4}
#' \deqn{e_l \sim Normal(\mu=0,\sigma=1.5)}
#' \deqn{l = 1, 2, 3}
#' \deqn{g = .2g_1 + .15g_2 - .3g_3 + .1g_4 + .05g_1g_3 + .2g_2g_3}
#' \deqn{e = -.45e_1 + .35e_2 + .2e_3}
#' \deqn{\mu = -1 + 2g + 3e + 4ge}
#' \tabular{cc}{
#' \eqn{y \sim Normal(\mu=\mu,\sigma=\code{sigma})} if \code{logit}=FALSE \cr
#' \eqn{y \sim Binomial(n=1,p=logit(\mu))} if \code{logit}=TRUE
#' }
#' @param N Sample size.
#' @param sigma Standard deviation of the gaussian noise (if \code{logit}=FALSE).
#' @param logit If TRUE, the outcome is transformed to binary with a logit link.
#' @param seed RNG seed.
#' @return Returns a list containing, in the following order: data.frame with the observed outcome (with noise) and the true outcome (without noise), data.frame of the genetic variants (G), data.frame of the environments (E), vector of the true genetic coefficients, vector of the true environmental coefficients, vector of the true main model coefficients
#' @examples
#'	example_2way(5,1,logit=FALSE)
#'	example_2way(5,0,logit=TRUE)
#' @export

example_2way = function(N, sigma=1, logit=FALSE, seed=NULL){
	# Simulates a 2-way GxE interaction model in which the genetic score g and
	# the environmental score e are weighted sums of observed variables.
	set.seed(seed)
	# Genetic variants: four Bernoulli(.30) draws plus two GxG product terms
	g1 <- rbinom(N, 1, .30)
	g2 <- rbinom(N, 1, .30)
	g3 <- rbinom(N, 1, .30)
	g4 <- rbinom(N, 1, .30)
	g1_g3 <- g1 * g3
	g2_g3 <- g2 * g3
	# Environmental variables: Normal(mean = 0, sd = 1.5)
	e1 <- rnorm(N, 0, 1.5)
	e2 <- rnorm(N, 0, 1.5)
	e3 <- rnorm(N, 0, 1.5)
	# Latent scores built with the fixed "true" weights
	g <- (.2*g1 + .15*g2 - .3*g3 + .1*g4 + .05*g1_g3 + .2*g2_g3)
	e <- -.45*e1 + .35*e2 + .2*e3
	# Noiseless outcome: mu = -1 + 2g + 3e + 4ge
	y_true <- -1 + 2*g + 3*e + 4*g*e
	if (logit) {
		# Binary outcome: y_true becomes the success probability
		y_true <- 1/(1+exp(-(y_true)))
		y <- rbinom(N, 1, y_true)
	} else {
		# Continuous outcome: add gaussian noise with sd = sigma
		y <- y_true + rnorm(N, 0, sigma)
	}
	list(data = data.frame(y, y_true),
		G = data.frame(g1, g2, g3, g4, g1_g3, g2_g3),
		E = data.frame(e1, e2, e3),
		coef_G = c(.2, .15, -.3, .1, .05, .2),
		coef_E = c(-.45, .35, .2),
		coef_main = c(-1, 2, 3, 4))
}

#' @title Simulated example of a 2 way interaction GxE model with crossover point.
#' @description Simulated example of a 2 way interaction GxE model with crossover point (where G and E are latent variables).
#' \deqn{g_j \sim Binomial(n=1,p=.30)}
#' \deqn{j = 1, 2, 3, 4}
#' \deqn{e_l \sim 10 Beta(\alpha,\beta))}
#' \deqn{l = 1, 2, 3}
#' \deqn{g = .30g_1 + .10g_2 + .20g_3 + .40g_4}
#' \deqn{e = .45e_1 + .35e_2 + .2e_3}
#' \deqn{\mu = coef[1] + coef[2]e + coef[3]ge}
#' \tabular{cc}{
#' \eqn{y \sim Normal(\mu=\mu,\sigma=\code{sigma})} if \code{logit}=FALSE \cr
#' \eqn{y \sim Binomial(n=1,p=logit(\mu))} if \code{logit}=TRUE
#' }
#' @param N Sample size.
#' @param sigma Standard deviation of the gaussian noise (if \code{logit}=FALSE).
#' @param c crossover point
#' @param coef_main Coefficients of the main model, must be a vector of size 3 for intercept, E main effect and GxE effect (Default = c(0,1,2)).
#' @param coef_G Coefficients of the 4 genes, must be a vector of size 4 (Default = c(.30, .10, .20, .40)).
#' @param coef_E Coefficients of the 3 environments, must be a vector of size 3 (Default = c(.45, .35, .2)).
#' @param logit If TRUE, the outcome is transformed to binary with a logit link.
#' @param seed RNG seed.
#' @param beta_param Vector of size two for the parameters of the beta distribution of the environmental variables (Default = c(2,2)).
#' @return Returns a list containing, in the following order: data.frame with the observed outcome (with noise) and the true outcome (without noise), data.frame of the genetic variants (G), data.frame of the environments (E), vector of the true genetic coefficients, vector of the true environmental coefficients, vector of the true main model coefficients, the crossover point.
#' @examples
#' ## Examples
#' # Diathesis Stress WEAK
#' ex_dia = example_with_crossover(250, c=10, coef_main = c(3,1,2), sigma=1)
#' # Diathesis Stress STRONG
#' ex_dia_s = example_with_crossover(250, c=10, coef_main = c(3,0,2), sigma=1)
#' # Differential Susceptibility WEAK
#' ex_ds = example_with_crossover(250, c=5, coef_main = c(3+5,1,2), sigma=1)
#' # Differential Susceptibility STRONG
#' ex_ds_s = example_with_crossover(250, c=5, coef_main = c(3+5,0,2), sigma=1)
#' @export

example_with_crossover = function(N, sigma=1, c = 0, coef_main=c(0,1,2), coef_G=c(.30, .10, .20, .40), coef_E=c(.45,.35,.2), logit=FALSE, seed=NULL, beta_param=c(2,2)){
	# Simulates a 2-way GxE model with a crossover point c:
	#   mu = coef_main[1] + coef_main[2]*(e - c) + coef_main[3]*g*(e - c)
	set.seed(seed)
	# Genetic variants: four Bernoulli(.30) draws
	g1 <- rbinom(N, 1, .30)
	g2 <- rbinom(N, 1, .30)
	g3 <- rbinom(N, 1, .30)
	g4 <- rbinom(N, 1, .30)
	# Environments: Beta(beta_param) draws rescaled to [0, 10]
	e1 <- rbeta(N, beta_param[1], beta_param[2])*10
	e2 <- rbeta(N, beta_param[1], beta_param[2])*10
	e3 <- rbeta(N, beta_param[1], beta_param[2])*10
	# Weighted latent scores
	g <- coef_G[1]*g1 + coef_G[2]*g2 + coef_G[3]*g3 + coef_G[4]*g4
	e <- coef_E[1]*e1 + coef_E[2]*e2 + coef_E[3]*e3
	y_true <- coef_main[1] + coef_main[2]*(e-c) + coef_main[3]*g*(e-c)
	if (logit) {
		# Binary outcome through the inverse-logit transform
		y_true <- 1/(1+exp(-(y_true)))
		y <- rbinom(N, 1, y_true)
	} else {
		# Continuous outcome with additive gaussian noise
		y <- y_true + rnorm(N, 0, sigma)
	}
	list(data = data.frame(y, y_true),
		G = data.frame(g1, g2, g3, g4),
		E = data.frame(e1, e2, e3),
		coef_G = coef_G, coef_E = coef_E,
		coef_main = coef_main, c = c)
}

#' @title Simulated example of a 3 way interaction GxExz model
#' @description Simulated example of a 3 way interaction GxExz model (where G and E are latent variables).
#' \deqn{g_j \sim Binomial(n=1,p=.30)}
#' \deqn{j = 1, 2, 3, 4}
#' \deqn{e_l \sim Normal(\mu=0,\sigma=1.5)}
#' \deqn{l = 1, 2, 3}
#' \deqn{z \sim Normal(\mu=3,\sigma=1)}
#' \deqn{g = .2g_1 + .15g_2 - .3g_3 + .1g_4 + .05g_1g_3 + .2g_2g_3}
#' \deqn{e = -.45e_1 + .35e_2 + .2e_3}
#' \deqn{\mu = -2 + 2g + 3e + z + 5ge - 1.5ez + 2gz + 2gez}
#' \tabular{cc}{
#' \eqn{y \sim Normal(\mu=\mu,\sigma=\code{sigma})} if \code{logit}=FALSE \cr
#' \eqn{y \sim Binomial(n=1,p=logit(\mu))} if \code{logit}=TRUE
#' }
#' @param N Sample size.
#' @param sigma Standard deviation of the gaussian noise (if \code{logit}=FALSE).
#' @param logit If TRUE, the outcome is transformed to binary with a logit link.
#' @param seed RNG seed.
#' @return Returns a list containing, in the following order: data.frame with the observed outcome (with noise), the true outcome (without noise) and \eqn{z}, data.frame of the genetic variants (G), data.frame of the environments (E), vector of the true genetic coefficients, vector of the true environmental coefficients, vector of the true main model coefficients
#' @examples
#'	example_3way(5,2.5,logit=FALSE)
#'	example_3way(5,0,logit=TRUE)
#' @export

example_3way = function(N, sigma=2.5, logit=FALSE, seed=NULL){
	# Simulates a 3-way GxExz interaction model: G and E are latent weighted
	# scores while z is a single observed moderator.
	set.seed(seed)
	# Genetic variants: four Bernoulli(.30) draws plus two GxG product terms
	g1 <- rbinom(N, 1, .30)
	g2 <- rbinom(N, 1, .30)
	g3 <- rbinom(N, 1, .30)
	g4 <- rbinom(N, 1, .30)
	g1_g3 <- g1 * g3
	g2_g3 <- g2 * g3
	# Environmental variables: Normal(mean = 0, sd = 1.5)
	e1 <- rnorm(N, 0, 1.5)
	e2 <- rnorm(N, 0, 1.5)
	e3 <- rnorm(N, 0, 1.5)
	# Latent scores built with the fixed "true" weights
	g <- (.2*g1 + .15*g2 - .3*g3 + .1*g4 + .05*g1_g3 + .2*g2_g3)
	e <- -.45*e1 + .35*e2 + .2*e3
	# Observed moderator z ~ Normal(3, 1)
	z <- rnorm(N, 3, 1)
	# Noiseless outcome with all 2-way and the 3-way interaction terms
	y_true <- -2 + 2*g + 3*e + z + 5*g*e - 1.5*z*e + 2*z*g + 2*z*g*e
	if (logit) {
		# Binary outcome: y_true becomes the success probability
		y_true <- 1/(1+exp(-(y_true)))
		y <- rbinom(N, 1, y_true)
	} else {
		# Continuous outcome: add gaussian noise with sd = sigma
		y <- y_true + rnorm(N, 0, sigma)
	}
	list(data = data.frame(y, y_true, z),
		G = data.frame(g1, g2, g3, g4, g1_g3, g2_g3),
		E = data.frame(e1, e2, e3),
		coef_G = c(.2, .15, -.3, .1, .05, .2),
		coef_E = c(-.45, .35, .2),
		coef_main = c(-2, 2, 3, 1, 5, -1.5, 2, 2))
}

#' @title Simulated example of a 3 way interaction GxExZ model
#' @description Simulated example of a 3 way interaction GxExZ model (where G, E and Z are latent variables).
#' \deqn{g_j \sim Binomial(n=1,p=.30)}
#' \deqn{j = 1, 2, 3, 4}
#' \deqn{e_k \sim Normal(\mu=0,\sigma=1.5)}
#' \deqn{k = 1, 2, 3}
#' \deqn{z_l \sim Normal(\mu=3,\sigma=1)}
#' \deqn{l = 1, 2, 3}
#' \deqn{g = .2g_1 + .15g_2 - .3g_3 + .1g_4 + .05g_1g_3 + .2g_2g_3}
#' \deqn{e = -.45e_1 + .35e_2 + .2e_3}
#' \deqn{z = .15z_1 + .75z_2 + .10z_3}
#' \deqn{\mu = -2 + 2g + 3e + z + 5ge - 1.5ez + 2gz + 2gez}
#' \tabular{cc}{
#' \eqn{y \sim Normal(\mu=\mu,\sigma=\code{sigma})} if \code{logit}=FALSE \cr
#' \eqn{y \sim Binomial(n=1,p=logit(\mu))} if \code{logit}=TRUE
#' }
#' @param N Sample size.
#' @param sigma Standard deviation of the gaussian noise (if \code{logit}=FALSE).
#' @param logit If TRUE, the outcome is transformed to binary with a logit link.
#' @param seed RNG seed.
#' @return Returns a list containing, in the following order: data.frame with the observed outcome (with noise) and the true outcome (without noise), list containing the data.frame of the genetic variants (G), the data.frame of the \eqn{e} environments (E) and the data.frame of the \eqn{z} environments (Z), vector of the true genetic coefficients, vector of the true \eqn{e} environmental coefficients, vector of the true \eqn{z} environmental coefficients, vector of the true main model coefficients
#' @examples
#'	example_3way_3latent(5,1,logit=FALSE)
#'	example_3way_3latent(5,0,logit=TRUE)
#' @export

example_3way_3latent = function(N, sigma=1, logit=FALSE, seed=NULL){
	# Simulates a 3-way GxExZ interaction model where G, E and Z are all
	# latent weighted scores of observed variables.
	set.seed(seed)
	# Genetic variants: four Bernoulli(.30) draws plus two GxG product terms
	g1 <- rbinom(N, 1, .30)
	g2 <- rbinom(N, 1, .30)
	g3 <- rbinom(N, 1, .30)
	g4 <- rbinom(N, 1, .30)
	g1_g3 <- g1 * g3
	g2_g3 <- g2 * g3
	# E-score environments: Normal(0, sd = 1.5)
	e1 <- rnorm(N, 0, 1.5)
	e2 <- rnorm(N, 0, 1.5)
	e3 <- rnorm(N, 0, 1.5)
	# Z-score environments: Normal(3, sd = 1)
	z1 <- rnorm(N, 3, 1)
	z2 <- rnorm(N, 3, 1)
	z3 <- rnorm(N, 3, 1)
	# Latent scores built with the fixed "true" weights
	g <- (.2*g1 + .15*g2 - .3*g3 + .1*g4 + .05*g1_g3 + .2*g2_g3)
	e <- -.45*e1 + .35*e2 + .2*e3
	z <- .15*z1 + .75*z2 + .10*z3
	# Noiseless outcome with all 2-way and the 3-way interaction terms
	y_true <- -2 + 2*g + 3*e + z + 5*g*e - 1.5*z*e + 2*z*g + 2*z*g*e
	if (logit) {
		# Binary outcome: y_true becomes the success probability
		y_true <- 1/(1+exp(-(y_true)))
		y <- rbinom(N, 1, y_true)
	} else {
		# Continuous outcome: add gaussian noise with sd = sigma
		y <- y_true + rnorm(N, 0, sigma)
	}
	list(data = data.frame(y, y_true),
		latent_var = list(G = data.frame(g1, g2, g3, g4, g1_g3, g2_g3),
			E = data.frame(e1, e2, e3),
			Z = data.frame(z1, z2, z3)),
		coef_G = c(.2, .15, -.3, .1, .05, .2),
		coef_E = c(-.45, .35, .2),
		coef_Z = c(.15, .75, .10),
		coef_main = c(-2, 2, 3, 1, 5, -1.5, 2, 2))
}

#' @title Simulated example of a 3 way interaction GxExZ model
#' @description Simulated example of a 3 way interaction GxExZ model (where G, E and Z are latent variables).
#' \deqn{g_j \sim Binomial(n=1,p=.30)}
#' \deqn{j = 1, 2, 3, 4}
#' \deqn{e_k \sim Normal(\mu=0,\sigma=1.5)}
#' \deqn{k = 1, 2, 3}
#' \deqn{z_l \sim Normal(\mu=3,\sigma=1)}
#' \deqn{l = 1, 2, 3}
#' \deqn{g = .2g_1 + .15g_2 - .3g_3 + .1g_4 + .05g_1g_3 + .2g_2g_3}
#' \deqn{e = -.45e_1 + .35e_2 + .2e_3}
#' \deqn{z = .15z_1 + .75z_2 + .10z_3}
#' \deqn{\mu = -2 + 2g + 3e + z + 5ge - 1.5ez + 2gz + 2gez}
#' \tabular{cc}{
#' \eqn{y \sim Normal(\mu=\mu,\sigma=\code{sigma})} if \code{logit}=FALSE \cr
#' \eqn{y \sim Binomial(n=1,p=logit(\mu))} if \code{logit}=TRUE
#' }
#' @param N Sample size.
#' @param sigma Standard deviation of the gaussian noise (if \code{logit}=FALSE).
#' @param logit If TRUE, the outcome is transformed to binary with a logit link.
#' @param seed RNG seed.
#' @return Returns a list containing, in the following order: data.frame with the observed outcome (with noise) and the true outcome (without noise), list containing the data.frame of the genetic variants (G), the data.frame of the \eqn{e} environments (E) and the data.frame of the \eqn{z} environments (Z), vector of the true genetic coefficients, vector of the true \eqn{e} environmental coefficients, vector of the true \eqn{z} environmental coefficients, vector of the true main model coefficients
#' @examples
#' # Doing only one iteration so its faster
#'	train = example_2way_lme4(250, 1, seed=777)
#'	D = train$data
#' G = train$G
#'	E = train$E
#'
#' F = y ~ G*E
#' fit = LEGIT(D, G, E, F, lme4=FALSE, maxiter=1)
#' summary(fit)
#' F = y ~ 1
#' fit_test = GxE_interaction_test(D, G, E, F, criterion="AIC", lme4=FALSE, maxiter=1)
#' fit_test
#' #fit_test = GxE_interaction_test(D, G, E, F, criterion="cv", lme4=FALSE, maxiter=1, cv_iter=1)
#' #fit_test
#'
#' F = y ~ G*E + (1|subject)
#' fit = LEGIT(D, G, E, F, lme4=TRUE, maxiter=1)
#' summary(fit)
#' F = y ~ (1|subject)
#' fit_test = GxE_interaction_test(D, G, E, F, criterion="AIC", lme4=TRUE, maxiter=1)
#' fit_test
#' #fit_test = GxE_interaction_test(D, G, E, F, criterion="cv", lme4=TRUE, maxiter=1, cv_iter=1)
#' #fit_test
#' @export
example_2way_lme4 = function (N, sigma = 1, logit = FALSE, seed = NULL) {
	# Simulates a longitudinal (2 time-points) version of the 2-way GxE model:
	# each of the N subjects is observed twice (time = 1 and time = 2).
	set.seed(seed)
	# Genetic variants: four Bernoulli(.30) draws plus two GxG product terms
	g1 = rbinom(N, 1, 0.3)
	g2 = rbinom(N, 1, 0.3)
	g3 = rbinom(N, 1, 0.3)
	g4 = rbinom(N, 1, 0.3)
	g1_g3 = g1 * g3
	g2_g3 = g2 * g3
	# Environmental variables: Normal(0, sd = 1.5)
	e1 = rnorm(N, 0, 1.5)
	e2 = rnorm(N, 0, 1.5)
	e3 = rnorm(N, 0, 1.5)
	# Latent scores with the fixed "true" weights
	g = (0.2 * g1 + 0.15 * g2 - 0.3 * g3 + 0.1 * g4 + 0.05 * g1_g3 + 0.2 * g2_g3)
	e = -0.45 * e1 + 0.35 * e2 + 0.2 * e3
	y_true = -1 + 2 * g + 3 * e + 4 * g * e
	if (logit) {
		# Binary outcomes; the second time-point gets extra noise on the logit scale
		y_true = 1/(1 + exp(-(y_true)))
		y = rbinom(N, 1, y_true)
		y_true2 = 1/(1 + exp(-(y_true + rnorm(N, 0, 2))))
		y2 = rbinom(N, 1, y_true2)
	}
	else {
		# Continuous outcomes; the second time-point is a shifted, noisier copy
		y = y_true + rnorm(N, 0, sigma)
		y2 = y + rnorm(N, 0.5, 0.2)
	}
	# Stack the two time-points; G and E are time-invariant so they are duplicated
	return(list(data = data.frame(y = c(y, y2), time = c(rep(1, N), rep(2, N)), subject = c(1:N, 1:N)),
		G = data.frame(g1 = c(g1, g1), g2 = c(g2, g2), g3 = c(g3, g3), g4 = c(g4, g4),
			g1_g3 = c(g1_g3, g1_g3), g2_g3 = c(g2_g3, g2_g3)),
		E = data.frame(e1 = c(e1, e1), e2 = c(e2, e2), e3 = c(e3, e3)),
		coef_G = c(0.2, 0.15, -0.3, 0.1, 0.05, 0.2),
		coef_E = c(-0.45, 0.35, 0.2),
		coef_main = c(-1, 2, 3, 4)))
}

#' @title Longitudinal folds
#' @description Function to create folds adequately for longitudinal datasets by forcing every observation with the same id to be in the same fold. Can be used with LEGIT_cv to make sure that the cross-validation folds are appropriate when using longitudinal data.
#' @param cv_iter Number of cross-validation iterations (Default = 1).
#' @param cv_folds Number of cross-validation folds (Default = 10).
#' @param id Factor vector containing the id number of each observation.
#' @param formula Optional Model formula. If data and formula are provided, only the non-missing observations will be used when creating the folds (Put "formula" here if you have missing data).
#' @param data Optional data.frame used for the formula. If data and formula are provided, only the non-missing observations will be used when creating the folds (Put "data" here if you have missing data).
#' @param data_needed Optional data.frame with variables that have to be included (Put "cbind(genes,env)"" or "latent_var" here if you have missing data).
#' @param print If FALSE, nothing except warnings will be printed. (Default = TRUE).
#' @return Returns a list of vectors containing the fold number for each observation
#' @examples
#' train = example_2way(500, 1, seed=777)
#' # Assuming it's longitudinal with 4 timepoints, even though it's not
#' id = factor(rep(1:125,each=4))
#' fit_cv = LEGIT_cv(train$data, train$G, train$E, y ~ G*E, folds=longitudinal_folds(1,10, id))
#' @export

longitudinal_folds = function(cv_iter=1, cv_folds=10, id, formula=NULL, data=NULL, data_needed=NULL, print=TRUE){
	# Creates cross-validation folds for longitudinal data: every observation
	# sharing the same id is forced into the same fold, so repeated measures
	# of a subject never leak across the train/test split.
	if (cv_folds > length(unique(id))) stop("cv_folds must be smaller than the number of unique id")
	# in IMLEGIT, data_needed would be latent_var which is a list and we need to unlist it if that's the case
	if (!is.null(data_needed)) if(class(data_needed)=="list") data_needed = do.call(cbind.data.frame, data_needed)
	if (!is.null(data) && !is.null(formula)){
		# Extracting only the variables available from the formula
		formula = as.formula(formula)
		vars_names = get.vars(formula)[get.vars(formula) %in% names(data)]
		formula_outcome = get.vars(formula)[1]
		if (!is.null(data_needed)){
			vars_names = c(vars_names,names(data_needed))
			data = data.frame(data,data_needed)
		}
		# Build "outcome ~ v1 + v2 + ..." so model.frame keeps only the needed columns
		vars_names[-length(vars_names)] = paste0(vars_names[-length(vars_names)], " + ")
		formula_n = paste0(formula_outcome, " ~ ", paste0(vars_names,collapse=""))
		data = stats::model.frame(formula_n, data, na.action=na.pass)
		# Keep only the ids whose rows have no missing values
		id = id[stats::complete.cases(data)]
	}
	else{
		if (!is.null(data_needed)) id = id[stats::complete.cases(data_needed)]
	}
	if(print) print(paste0("You have ",length(unique(id))," unique observations with ", max(table(factor(id)))," categories/time-points for a total of ", length(id)," observations"))
	folds = vector("list", cv_iter)
	for (i in 1:cv_iter){
		# Randomly permute the unique ids, then cut them into cv_folds groups;
		# every observation of an id inherits that id's fold number.
		s = sample(sort(unique(id)))
		id_new = cut(1:length(s),breaks=cv_folds,labels=FALSE)
		folds[[i]] = rep(NA, length(id))
		for (j in 1:cv_folds){
			folds[[i]][id %in% s[id_new==j]] = j
		}
	}
	return(folds)
}

#' @title Latent Environmental & Genetic InTeraction (LEGIT) model
#' @description Constructs a generalized linear model (glm) with a weighted latent environmental score and weighted latent genetic score using alternating optimization.
#' @param data data.frame of the dataset to be used.
#' @param genes data.frame of the variables inside the genetic score \emph{G} (can be any sort of variable, doesn't even have to be genetic).
#' @param env data.frame of the variables inside the environmental score \emph{E} (can be any sort of variable, doesn't even have to be environmental).
#' @param formula Model formula. Use \emph{E} for the environmental score and \emph{G} for the genetic score. Do not manually code interactions, write them in the formula instead (ex: G*E*z or G:E:z).
#' @param start_genes Optional starting points for genetic score (must be the same length as the number of columns of \code{genes}).
#' @param start_env Optional starting points for environmental score (must be the same length as the number of columns of \code{env}). #' @param eps Threshold for convergence (.01 for quick batch simulations, .0001 for accurate results). #' @param maxiter Maximum number of iterations. #' @param family Outcome distribution and link function (Default = gaussian). #' @param ylim Optional vector containing the known min and max of the outcome variable. Even if your outcome is known to be in [a,b], if you assume a Gaussian distribution, predict() could return values outside this range. This parameter ensures that this never happens. This is not necessary with a distribution that already assumes the proper range (ex: [0,1] with binomial distribution). #' @param print If FALSE, nothing except warnings will be printed (Default = TRUE). #' @param print_steps If TRUE, print the parameters at all iterations, good for debugging (Default = FALSE). #' @param crossover If not NULL, estimates the crossover point of \emph{E} using the provided value as starting point (To test for diathesis-stress vs differential susceptibility). #' @param crossover_fixed If TRUE, instead of estimating the crossover point of E, we force/fix it to the value of "crossover". (Used when creating a diathes-stress model) (Default = FALSE). #' @param reverse_code If TRUE, after fitting the model, the genes with negative weights are reverse coded (ex: \eqn{g_rev} = 1 - \eqn{g}). It assumes that the original coding is in [0,1]. The purpose of this option is to prevent genes with negative weights which cause interpretation problems (ex: depression normally decreases attention but with a negative genetic score, it increases attention). Warning, using this option with GxG interactions could cause nonsensical results since GxG could be inverted. Also note that this may fail with certain models (Default=FALSE). #' @param rescale If TRUE, the environmental variables are automatically rescaled to the range [-1,1]. 
This improves interpretability (Default=FALSE). #' @param lme4 If TRUE, uses lme4::lmer or lme4::glmer; Note that is an experimental feature, bugs may arise and certain functions may fail. Currently only summary(), plot(), GxE_interaction_test(), LEGIT(), LEGIT_cv() work. Also note that the AIC and certain elements ignore the existence of the genes and environment variables, thus the AIC may not be used for variable selection of the genes and the environment. However, the AIC can still be used to compare models with the same genes and environments. (Default=FALSE). #' @return Returns an object of the class "LEGIT" which is list containing, in the following order: a glm fit of the main model, a glm fit of the genetic score, a glm fit of the environmental score, a list of the true model parameters (AIC, BIC, rank, df.residual, null.deviance) for which the individual model parts (main, genetic, environmental) don't estimate properly and the formula. #' @examples #' train = example_2way(500, 1, seed=777) #' fit_best = LEGIT(train$data, train$G, train$E, y ~ G*E, train$coef_G, train$coef_E)
#'	fit_default = LEGIT(train$data, train$G, train$E, y ~ G*E) #' summary(fit_default) #' summary(fit_best) #' #' train = example_3way(500, 2.5, seed=777) #' fit_best = LEGIT(train$data, train$G, train$E, y ~ G*E*z, train$coef_G, train$coef_E)
#'	fit_default = LEGIT(train$data, train$G, train$E, y ~ G*E*z) #' summary(fit_default) #' summary(fit_best) #' #' @import formula.tools stats #' @references Alexia Jolicoeur-Martineau, Ashley Wazana, Eszter Szekely, Meir Steiner, Alison S. Fleming, James L. Kennedy, Michael J. Meaney, Celia M.T. Greenwood and the MAVAN team. \emph{Alternating optimization for GxE modelling with weighted genetic and environmental scores: examples from the MAVAN study} (2017). arXiv:1703.08111. #' @export LEGIT = function(data, genes, env, formula, start_genes=NULL, start_env=NULL, eps=.001, maxiter=100, family=gaussian, ylim=NULL, print=TRUE, print_steps=FALSE, crossover = NULL, crossover_fixed = FALSE, reverse_code=FALSE, rescale=FALSE, lme4=FALSE) { if (!is.null(ylim)){ if (!is.numeric(ylim) || length(ylim) !=2) stop("ylim must either be NULL or a numeric vector of size two") } if (maxiter <= 0) warning("maxiter must be > 0") if(!is.null(start_genes)){ if (NCOL(genes)!=length(start_genes)) stop("start_genes must either be NULL or have the same length as the number of genes") } if(!is.null(start_env)){ if (NCOL(env)!=length(start_env)) stop("start_env must either be NULL or have the same length as the number of environments") } if (class(data) != "data.frame" && class(data) != "matrix") stop("data must be a data.frame") # getting right formats # Retaining only the needed variables from the dataset (need to set G and E variables for this to work, they will be replaced with their proper values later) data=data.frame(data) data$G=0
data$E=0 formula = stats::as.formula(formula) # Formula with no random effects formula_norand = gsub("\\s", "", deparse(formula)) formula_norand = gsub("\\+\\(.*)","",formula_norand) formula_norand = gsub("\\(.*)","",formula_norand) formula_norand = as.formula(formula_norand) # Formula with rand effect put as fixed effects, so model.frame works formula_wrand = gsub("(","",formula, fixed=TRUE) formula_wrand = gsub(")","",formula_wrand, fixed=TRUE) formula_wrand = gsub("||","+",formula_wrand, fixed=TRUE) formula_wrand = gsub("|","+",formula_wrand, fixed=TRUE) data = stats::model.frame(formula_wrand, data=data, na.action=na.pass) genes = as.matrix(genes, drop=FALSE) if (is.null(colnames(genes))){ if (print) cat("You have not specified column names for genes, they will be named gene1, gene2, ...\n") colnames(genes) = paste0("gene",1:NCOL(genes)) } env_ = env if (!is.null(crossover) && !crossover_fixed) env = cbind(-1,env) env_ = as.matrix(env_, drop=FALSE) env = as.matrix(env, drop=FALSE) if (is.null(colnames(env))){ if (print) cat("You have not specified column names for env, they will be named env1, env2, ...\n") colnames(env) = paste0("env",1:NCOL(env)) colnames(env_) = paste0("env_",1:NCOL(env_)) } else if (!is.null(crossover) && !crossover_fixed) colnames(env)[1]="crossover" formula = stats::as.formula(formula) # Can only reverse genes in [0,1] if (reverse_code && (min(genes) !=0 || max(genes) !=1)) stop("Please make sure that genes are in range [0,1].") else{ genes_names_orig = colnames(genes) genes_inverted = rep(FALSE, NCOL(genes)) } # Error message about factors if (sum(apply(data,2,is.numeric)) != NCOL(data) || sum(apply(genes,2,is.numeric)) != NCOL(genes) || sum(apply(env,2,is.numeric)) != NCOL(env)) stop("All variables used must be numeric, factors are not allowed. 
Please dummy code all categorical variables inside your datasets (data, gene, env)") # remove missing data comp = stats::complete.cases(data,genes,env) data = data[comp,, drop=FALSE] genes = genes[comp,, drop=FALSE] env = env[comp,, drop=FALSE] env_ = env_[comp,, drop=FALSE] if (dim(data)[1] <= 0) stop("no valid observation without missing values") # Rescale environments if (rescale){ if (!is.null(crossover) && !crossover_fixed) env[,-1] = apply(env[,-1, drop=FALSE], 2, function(x) (x - min(x))*2/(max(x)-min(x))-1) else env = apply(env, 2, function(x) (x - min(x))*2/(max(x)-min(x))-1) env_ = apply(env_, 2, function(x) (x - min(x))*2/(max(x)-min(x))-1) } #Adding empty variables in main dataset for genes and env data[,colnames(genes)]=0 data[,colnames(env)]=0 data$R0_b=0
data$R0_c=0 # Setting up initial weighted scores if (is.null(start_genes)) weights_genes = rep(1/dim(genes)[2],dim(genes)[2]) else if (sum(abs(start_genes))==0) weights_genes = rep(1/dim(genes)[2],dim(genes)[2]) else weights_genes = start_genes/sum(abs(start_genes)) if (is.null(start_env)) weights_env = rep(1/dim(env_)[2],dim(env_)[2]) else if (sum(abs(start_env))==0) weights_env = rep(1/dim(env_)[2],dim(env_)[2]) else weights_env = start_env/sum(abs(start_env)) if (!is.null(crossover) && !crossover_fixed) weights_env = c(crossover, weights_env) data$G = genes%*%weights_genes
if (!is.null(crossover) && crossover_fixed) data$E = env%*%weights_env - crossover else data$E = env%*%weights_env

if (print_steps) cat(paste0("G: ", paste(round(weights_genes,2), collapse = " "), " // E: ", paste(round(weights_env,2), collapse = " "),"\n"))

# Deconstructing formula into parts (No E or G / only E / only G / both G and E)
formula_ = formula_norand
formula_full = stats::terms(formula_,simplify=TRUE)
formula_outcome = get.vars(formula_)[1]
formula_elem_ = attributes(formula_full)$term.labels # Adding white spaces before and after to recognize a "E" as opposed to another string like "Elephant" formula_elem = paste("", formula_elem_,"") index_with_G = grepl(" G ",formula_elem, fixed=TRUE) | grepl(" G:",formula_elem, fixed=TRUE) | grepl(":G:",formula_elem, fixed=TRUE) | grepl(":G ",formula_elem, fixed=TRUE) index_with_E = grepl(" E ",formula_elem, fixed=TRUE) | grepl(" E:",formula_elem, fixed=TRUE) | grepl(":E:",formula_elem, fixed=TRUE) | grepl(":E ",formula_elem, fixed=TRUE) index_with_GE = index_with_G & index_with_E index_with_G = index_with_G & !index_with_GE index_with_E = index_with_E & !index_with_GE data_expanded = stats::model.matrix(formula_, data=data) if (colnames(data_expanded)[1] == "(Intercept)"){ formula_elem = c("1",formula_elem) index_with_G = c(FALSE, index_with_G) index_with_E = c(FALSE, index_with_E) index_with_GE = c(FALSE, index_with_GE) } index_without_GE = !(index_with_G | index_with_E | index_with_GE) ## Formulas for reparametrization in step b (estimating G) formula_elem_withoutG = formula_elem[index_without_GE | index_with_E] formula_elem_withoutG[-length(formula_elem_withoutG)] = paste0(formula_elem_withoutG[-length(formula_elem_withoutG)], " + ") formula_withoutG = paste0(formula_outcome, " ~ ", paste0(formula_elem_withoutG,collapse="")) if (formula_elem[1] != "1") formula_withoutG = paste0(formula_withoutG, " - 1") formula_withoutG = stats::as.formula(formula_withoutG) formula_elem_withG = formula_elem[index_with_G | index_with_GE] # Remove G elements from formula because we want (b1 + b2*E + ...)*G rather than b1*G + b2*E*G + ... 
formula_elem_withG = gsub(" G ","1",formula_elem_withG, fixed=TRUE) formula_elem_withG = gsub(" G:","",formula_elem_withG, fixed=TRUE) formula_elem_withG = gsub(":G:",":",formula_elem_withG, fixed=TRUE) formula_elem_withG = gsub(":G ","",formula_elem_withG, fixed=TRUE) formula_elem_withG[-length(formula_elem_withG)] = paste0(formula_elem_withG[-length(formula_elem_withG)], " + ") formula_withG = paste0(formula_outcome, " ~ ", paste0(formula_elem_withG,collapse="")) if (sum(grepl("1",formula_elem_withG, fixed=TRUE))==0) formula_withG = paste0(formula_withG, " - 1") formula_withG = stats::as.formula(formula_withG) ## Formulas for reparametrization in step c (estimating E) formula_elem_withoutE = formula_elem[index_without_GE | index_with_G] formula_elem_withoutE[-length(formula_elem_withoutE)] = paste0(formula_elem_withoutE[-length(formula_elem_withoutE)], " + ") formula_withoutE = paste0(formula_outcome, " ~ ", paste0(formula_elem_withoutE,collapse="")) if (formula_elem[1] != "1") formula_withoutE = paste0(formula_withoutE, " - 1") formula_withoutE = stats::as.formula(formula_withoutE) formula_elem_withE = formula_elem[index_with_E | index_with_GE] # Remove E elements from formula because we want (b1 + b2*G + ...)*E rather than b1*E + b2*G*E + ... 
formula_elem_withE = gsub(" E ","1",formula_elem_withE, fixed=TRUE) formula_elem_withE = gsub(" E:","",formula_elem_withE, fixed=TRUE) formula_elem_withE = gsub(":E:",":",formula_elem_withE, fixed=TRUE) formula_elem_withE = gsub(":E ","",formula_elem_withE, fixed=TRUE) formula_elem_withE[-length(formula_elem_withE)] = paste0(formula_elem_withE[-length(formula_elem_withE)], " + ") formula_withE = paste0(formula_outcome, " ~ ", paste0(formula_elem_withE,collapse="")) if (sum(grepl("1",formula_elem_withE, fixed=TRUE))==0) formula_withE = paste0(formula_withE, " - 1") formula_withE = stats::as.formula(formula_withE) # Making formula for step b (estimating G) genes_names = colnames(genes) genes_names[-length(genes)] = paste0(colnames(genes)[-length(genes)], " + ") formula_b = paste0(formula_outcome, " ~ ", paste0(genes_names,collapse="")) formula_b = paste0(formula_b, " offset(R0_b) - 1") formula_b = stats::as.formula(formula_b) # Making formula for step c (estimating E) env_names = colnames(env) env_names[-length(env)] = paste0(colnames(env)[-length(env)], " + ") formula_c = paste0(formula_outcome, " ~ ", paste0(env_names,collapse="")) formula_c = paste0(formula_c, " offset(R0_c) - 1") formula_c = stats::as.formula(formula_c) weights_genes_old_old = NULL weights_genes_old = NULL for (i in 1:maxiter){ ## Step a : Fit linear mixed effect model if (lme4){ fit_a = stats::glm(formula, data=data, family=family, y=FALSE, model=FALSE) if (family()$family=="gaussian") fit_a = lme4::lmer(formula, data=data)
else  fit_a = lme4::glmer(formula, data=data, family=family)
rand_effects = predict(fit_a, data, re.form = NULL) - predict(fit_a, data, re.form = NA) # We need to fix this part
fit_a_coef = lme4::fixef(fit_a)
}
else{ # fit main model
fit_a = stats::glm(formula, data=data, family=family, y=FALSE, model=FALSE)
rand_effects = 0
fit_a_coef = stats::coef(fit_a)
}

if (NCOL(genes)>1){
# Reparametrizing variables for step b (estimating G)
data_expanded_withoutG = stats::model.matrix(formula_withoutG, data=data)
data$R0_b = data_expanded_withoutG%*%fit_a_coef[(index_without_GE | index_with_E)] + rand_effects data_expanded_withG = stats::model.matrix(formula_withG, data=data) R1_b = data_expanded_withG%*%fit_a_coef[(index_with_G | index_with_GE)] R1_b_genes = genes*as.vector(R1_b) data[,colnames(genes)]=R1_b_genes ## Step b : fit model for G fit_b = stats::glm(formula_b, data=data, family=family, y=FALSE, model=FALSE) weights_genes_ = stats::coef(fit_b) # Reverse coding if (reverse_code && sum(weights_genes_ < 0) > 0){ weights_genes_old_old = weights_genes_old # Invert gene coding genes[,weights_genes_ < 0] = 1 - genes[,weights_genes_ < 0] genes_inverted[weights_genes_ < 0] = !genes_inverted[weights_genes_ < 0] # changing names for user to know they were inverted colnames(genes)[(weights_genes_ < 0) & genes_inverted] = paste0(genes_names_orig[weights_genes_ < 0 & genes_inverted],"_inverted") colnames(genes)[(weights_genes_ < 0) & !genes_inverted] = genes_names_orig[weights_genes_ < 0 & !genes_inverted] # Remaking formula for step b (estimating G) genes_names = colnames(genes) genes_names[-length(genes)] = paste0(colnames(genes)[-length(genes)], " + ") formula_b = paste0(formula_outcome, " ~ ", paste0(genes_names,collapse="")) formula_b = paste0(formula_b, " offset(R0_b) - 1") formula_b = stats::as.formula(formula_b) # Refit R1_b_genes = genes*as.vector(R1_b) data[,colnames(genes)]=R1_b_genes fit_b = stats::glm(formula_b, data=data, family=family, y=FALSE, model=FALSE) weights_genes_ = stats::coef(fit_b) } # Updating G estimates and checking convergence weights_genes_old = weights_genes weights_genes = weights_genes_/sum(abs(weights_genes_)) if (!is.null(weights_genes_old_old)){ if(sqrt(sum((weights_genes_old_old-weights_genes)^2)) < eps){ warning("Can't reverse code properly, returning the model with reverse_code=FALSE.") return(LEGIT(data=data, genes=genes, env=env, formula=formula, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, 
ylim=ylim, print=print, print_steps=print_steps, crossover = crossover, crossover_fixed = crossover_fixed, reverse_code=FALSE)) } } data$G = genes%*%weights_genes

if(sqrt(sum((weights_genes_old-weights_genes)^2)) < eps) conv_G = TRUE
else conv_G = FALSE
}
else conv_G = TRUE

if (NCOL(env)>1){
# Reparametrizing variables for step c (estimating E)
data_expanded_withoutE = stats::model.matrix(formula_withoutE, data=data)
data$R0_c = data_expanded_withoutE%*%fit_a_coef[(index_without_GE | index_with_G)] + rand_effects data_expanded_withE = stats::model.matrix(formula_withE, data=data) R1_c = data_expanded_withE%*%fit_a_coef[(index_with_E | index_with_GE)] R1_c_env = env*as.vector(R1_c) data[,colnames(env)]=R1_c_env if (!is.null(crossover) && crossover_fixed) data$R0_c = data$R0_c - crossover*R1_c ## Step c : fit model for E fit_c = stats::glm(formula_c, data=data, family=family, y=FALSE, model=FALSE) weights_env_ = stats::coef(fit_c) # Updating E estimates and checking convergence weights_env_old = weights_env if (!is.null(crossover) && !crossover_fixed) weights_env[-1] = weights_env_[-1]/sum(abs(weights_env_[-1])) else weights_env = weights_env_/sum(abs(weights_env_)) if (!is.null(crossover) && crossover_fixed) data$E = env%*%weights_env - crossover
else data$E = env%*%weights_env if(sqrt(sum((weights_env_old-weights_env)^2)) < eps) conv_E = TRUE else conv_E = FALSE } else conv_E = TRUE if (print_steps && !is.null(crossover) && crossover_fixed) cat(paste0("Main: ", paste(round(coef(fit_a),2), collapse = " "), " // G: ", paste(round(weights_genes,2), collapse = " "), " // E: ", paste(round(weights_env,2), collapse = " "), " // C: ", round(crossover,2),"\n")) else if (print_steps && !is.null(crossover) && !crossover_fixed) cat(paste0("Main: ", paste(round(coef(fit_a),2), collapse = " "), " // G: ", paste(round(weights_genes,2), collapse = " "), " // E: ", paste(round(weights_env[-1],2), collapse = " "), " // C: ", round(weights_env[1],2),"\n")) else if (print_steps) cat(paste0("Main: ", paste(round(coef(fit_a),2), collapse = " "), " // G: ", paste(round(weights_genes,2), collapse = " "), " // E: ", paste(round(weights_env,2), collapse = " "),"\n")) if (conv_G & conv_E) break } if (print_steps) cat("\n") # Rerunning last time and scaling to return as results if (lme4){ fit_a = stats::glm(formula, data=data, family=family, y=FALSE, model=FALSE) if (family()$family=="gaussian") fit_a = lme4::lmer(formula, data=data)
else  fit_a = lme4::glmer(formula, data=data, family=family)
rand_effects = predict(fit_a, data, re.form = NULL) - predict(fit_a, data, re.form = NA) # We need to fix this part
fit_a_coef = lme4::fixef(fit_a)
}
else{ # fit main model
fit_a = stats::glm(formula, data=data, family=family, y=FALSE, model=FALSE)
rand_effects = 0
fit_a_coef = stats::coef(fit_a)
}

# Reparametrizing variables for step b (estimating G)
data_expanded_withoutG = stats::model.matrix(formula_withoutG, data=data)
data$R0_b = data_expanded_withoutG%*%fit_a_coef[(index_without_GE | index_with_E)] + rand_effects data_expanded_withG = stats::model.matrix(formula_withG, data=data) R1_b = data_expanded_withG%*%fit_a_coef[(index_with_G | index_with_GE)] R1_b_genes = genes*as.vector(R1_b) data[,colnames(genes)]=R1_b_genes fit_b = stats::glm(formula_b, data=data, family=family, y=FALSE, model=FALSE) data[,colnames(genes)] = data[,colnames(genes)]*sum(abs(stats::coef(fit_b))) fit_b = stats::glm(formula_b, data=data, family=family, y=FALSE, model=FALSE) # Reparametrizing variables for step c (estimating E) data_expanded_withoutE = stats::model.matrix(formula_withoutE, data=data) data$R0_c = data_expanded_withoutE%*%fit_a_coef[(index_without_GE | index_with_G)] + rand_effects
data_expanded_withE = stats::model.matrix(formula_withE, data=data)
R1_c = data_expanded_withE%*%fit_a_coef[(index_with_E | index_with_GE)]
R1_c_env = env*as.vector(R1_c)
data[,colnames(env)]=R1_c_env
if (!is.null(crossover) && crossover_fixed) data$R0_c = data$R0_c - crossover*R1_c

fit_c = stats::glm(formula_c, data=data, family=family, y=FALSE, model=FALSE)
if (!is.null(crossover) && !crossover_fixed) data[,colnames(env)] = data[,colnames(env)]*sum(abs(stats::coef(fit_c)[-1]))
else data[,colnames(env)] = data[,colnames(env)]*sum(abs(stats::coef(fit_c)))
fit_c = stats::glm(formula_c, data=data, family=family, y=FALSE, model=FALSE)

if (!is.null(crossover) && !crossover_fixed) crossover = as.numeric(coef(fit_c)[1])

# Make sure data make sense for user (and for plot function)
if (!lme4){ # cannot change a lme4 data sadly
fit_a$data[,colnames(env)] = env fit_a$data[,colnames(genes)] = genes
}
fit_b$data[,colnames(genes)] = genes fit_b$data = cbind(fit_b$data, data[,!(names(data) %in% names(fit_b$data))])
fit_c$data[,colnames(env)] = env fit_c$data = cbind(fit_c$data, data[,!(names(data) %in% names(fit_c$data))])
weights_genes = stats::coef(fit_b)
if (!lme4) fit_a$data$G = genes%*%weights_genes # cannot change a lme4 data sadly
fit_b$data$G = genes%*%weights_genes
fit_c$data$G = genes%*%weights_genes
if (!is.null(crossover) && !crossover_fixed) weights_env = stats::coef(fit_c)[-1]
else weights_env = stats::coef(fit_c)
if (is.null(crossover)) crossover_ = 0
else crossover_ = crossover
if (!lme4) fit_a$data$E = env_%*%weights_env - crossover_ # cannot change a lme4 data sadly
#fit_b$data$E = env_%*%weights_env - crossover_
#fit_c$data$E = env_%*%weights_env - crossover_

if (!lme4){
if (!(abs((fit_a$deviance-fit_b$deviance)/fit_a$deviance))<.01 && abs(((fit_a$deviance-fit_c$deviance)/fit_c$deviance))<.01) warning("Deviance differs by more than 1% between model parts. Make sure that everything was set up properly and try increasing the number of iterations (maxiter).")
}

#Change some arguments so that we get the right AIC, BIC and dispersion for the model
# I don't agree but we need to follow the same guideline to make sure it matches with glm()
logLik_df =  attr(logLik(fit_a), "df") + (fit_b$rank - 1) + (fit_c$rank - 1)

if (is.na(logLik(fit_a))){
# Quasi-GLM stuff
get_dispersion <- function(object) {
  # Dispersion estimate (c-hat) for quasi-likelihood models: weighted sum of
  # squared residuals over observations with positive weight, divided by the
  # residual degrees of freedom of the fitted model.
  keep <- object$weights > 0
  sum(object$weights[keep] * object$residuals[keep]^2) / object$df.residual
}
get_loglik <- function(object) {
-object$deviance / 2 } log_lik = get_loglik(fit_a) c_hat = get_dispersion(fit_a) } else{ log_lik = logLik(fit_a)[1] c_hat = 1 } true_aic = 2*logLik_df - (2*log_lik/c_hat) true_aicc = true_aic + ((2*logLik_df*(logLik_df+1))/(stats::nobs(fit_a)-logLik_df-1)) true_bic = log(stats::nobs(fit_a))*logLik_df - (2*log_lik/c_hat) if (!lme4){ true_rank = fit_a$rank + (fit_b$rank - 1) + (fit_c$rank - 1)
# true_rank might be different from df of logLik, this is because glm() assume that variance components should be counted
true_df.residual = stats::nobs(fit_a) - true_rank
true_null.deviance = fit_a$null.deviance lme4 = FALSE } else{ true_rank = length(lme4::fixef(fit_a)) + (fit_b$rank - 1) + (fit_c$rank - 1) true_df.residual = stats::nobs(fit_a) - true_rank fit_a_null = stats::glm(formula_norand, data=data, family=family, y=FALSE, model=FALSE) true_null.deviance = fit_a_null$null.deviance
lme4 = TRUE
}
# print convergences stuff;
conv = (conv_G & conv_E)
if (conv_G & conv_E){
if (print) cat(paste0("Converged in ",i, " iterations\n"))
}
else{
warning(paste0("Did not reach convergence in maxiter iterations. Try increasing maxiter or make eps smaller."))
if (!is.null(crossover) && !crossover_fixed) warning(paste0("The model is not garanteed to converge when a crossover point is estimated. Try different starting points for the crossover point."))
}
if (is.null(crossover)) crossover_fixed=NULL
result = list(fit_main = fit_a, fit_genes = fit_b, fit_env = fit_c, true_model_parameters=list(AIC = true_aic, AICc = true_aicc, BIC = true_bic, rank = true_rank, df.residual = true_df.residual, null.deviance=true_null.deviance, lme4=lme4), ylim=ylim, formula=formula, crossover=crossover, crossover_fixed=crossover_fixed, conv=conv)
class(result) <- "LEGIT"
return(result)
}

#' @title Testing of the GxE interaction
#' @description Testing of the GxE interaction using the competitive-confirmatory approach adapted from Belsky, Pluess and Widaman (2013). Reports the different hypotheses (diathesis-stress, vantage-sensitivity, or differential susceptibility), assuming or not assuming a main effect for \emph{E} (WEAK vs STRONG) using the LEGIT model.
#' @param data data.frame of the dataset to be used.
#' @param genes data.frame of the variables inside the genetic score \emph{G} (can be any sort of variable, doesn't even have to be genetic).
#' @param env data.frame of the variables inside the environmental score \emph{E} (can be any sort of variable, doesn't even have to be environmental).
#' @param formula_noGxE formula WITHOUT \emph{G} or \emph{E} (y ~ covariates). \emph{G} and \emph{E} will automatically be added properly based on the hypotheses tested.
#' @param crossover A tuple containing the minimum and maximum of the environment used as crossover point of \emph{E} used in the vantage sensitivity and diathesis-stress models. Instead of providing two numbers, you can also write c("min","max") to automatically choose the expected minimum or maximum of the environmental score which is calculated based on the min/max of the environments and the current weights.
#' @param include_noGxE_models If True, we test for models with only G, only E, both G and E, neither G and E (four models without a GxE). This is to verify for false positives, if one of those models has the best fit, then it is possible that there is no GxE, thus no type of GxE. With a single gene and environment, simply looking at the p-value of the GxE is good enough to get around 5-10 percent false positive rate, but with multiple genes and environments, we need to compare model fits to get a low false positive rate. Use your own judgment when using this because if you have multiple genes and environments and small/moderate N, a model without GxE could have a lower BIC but still not be the actual best model. However, if you see little difference in BIC between all 4 GxE models and the non-GxE models have much lower BIC, then it is likely that there is no GxE. Note that this is only implemented for AIC, AICc and BIC. (Default = True)
#' @param reverse_code If TRUE, after fitting the model, the genes with negative weights are reverse coded (ex: \eqn{g_{rev}} = 1 - \eqn{g}). It assumes that the original coding is in [0,1]. The purpose of this option is to prevent genes with negative weights which cause interpretation problems (ex: depression normally decreases attention but with a negative genetic score, it increases attention). Warning, using this option with GxG interactions could cause nonsensical results since GxG could be inverted. Also note that this may fail with certain models (Default=FALSE).
#' @param rescale If TRUE, the environmental variables are automatically rescaled to the range [-1,1]. This improves interpretability (Default=FALSE).
#' @param boot Optional number of bootstrap samples. If not NULL, we use bootstrap to find the confidence interval of the crossover point. This provides more realistic confidence intervals. Make sure to use a bigger number (>= 1000) to get good precision; also note that a too small number could return an error ("estimated adjustment 'a' is NA").
#' @param criterion Criterion used to assess which model is the best. It can be set to "AIC", "AICc", "BIC", "cv", "cv_AUC", "cv_Huber" (Default="BIC").
#' @param start_genes Optional starting points for genetic score (must be the same length as the number of columns of \code{genes}).
#' @param start_env Optional starting points for environmental score (must be the same length as the number of columns of \code{env}).
#' @param eps Threshold for convergence (.01 for quick batch simulations, .0001 for accurate results).
#' @param maxiter Maximum number of iterations.
#' @param family Outcome distribution and link function (Default = gaussian).
#' @param ylim Optional vector containing the known min and max of the outcome variable. Even if your outcome is known to be in [a,b], if you assume a Gaussian distribution, predict() could return values outside this range. This parameter ensures that this never happens. This is not necessary with a distribution that already assumes the proper range (ex: [0,1] with binomial distribution).
#' @param cv_iter Number of cross-validation iterations (Default = 5).
#' @param cv_folds Number of cross-validation folds (Default = 10). Using \code{cv_folds=NROW(data)} will lead to leave-one-out cross-validation.
#' @param folds Optional list of vectors containing the fold number for each observation. Bypass cv_iter and cv_folds. Setting your own folds could be important for certain data types like time series or longitudinal data.
#' @param id Optional id of observations, can be a vector or data.frame (only used when returning list of possible outliers).
#' @param classification Set to TRUE if you are doing classification (binary outcome).
#' @param seed Seed for cross-validation folds.
#' @param Huber_p Parameter controlling the Huber cross-validation error (Default = 1.345).
#' @param test_only If TRUE, only uses the first fold for training and predict the others folds; do not train on the other folds. So instead of cross-validation, this gives you train/test and you get the test R-squared as output.
#' @param lme4 If TRUE, uses lme4::lmer or lme4::glmer; Note that this is an experimental feature, bugs may arise and certain functions may fail. Currently only summary(), plot(), GxE_interaction_test(), LEGIT(), LEGIT_cv() work. Also note that the AIC and certain elements ignore the existence of the genes and environment variables, thus the AIC may not be used for variable selection of the genes and the environment. However, the AIC can still be used to compare models with the same genes and environments. (Default=FALSE).
#' @return Returns a list containing 1) the six models ordered from best to worse (vantage sensitivity WEAK/STRONG, diathesis-stress WEAK/STRONG, differential susceptibility WEAK/STRONG) and 2) a data frame with the criterion, the crossover, 95\% coverage of the crossover, whether the crossover 95\% interval is within the observable range and the percentage of observations below the crossover point in order from best to worst based on the selected criterion. Models not within the observable range should be rejected even if the criterion is slightly better. An extremely low percentage of observations below the crossover point is also evidence toward diathesis-stress. Note that we assume that the environmental score is from bad to good but if this is not the case, then the models labelled as "diathesis-stress" could actually reflect vantage sensitivity and vice-versa. If outcome is Good-to-Bad: C=min(E) is diathesis-stress, C=max(E) is vantage sensitivity. If outcome is Bad-to-Good: C=max(E) is diathesis-stress, C=min(E) is vantage sensitivity.
#' @examples
#' \dontrun{
#' ## Examples where x is in [0, 10]
#' # Diathesis Stress WEAK
#' ex_dia = example_with_crossover(250, c=10, coef_main = c(3,1,2), sigma=1)
#' # Diathesis Stress STRONG
#' ex_dia_s = example_with_crossover(250, c=10, coef_main = c(3,0,2), sigma=1)
#' ## Assuming there is a crossover point at x=5
#' # Differential Susceptibility WEAK
#' ex_ds = example_with_crossover(250, c=5, coef_main = c(3+5,1,2), sigma=1)
#' # Differential Susceptibility STRONG
#' ex_ds_s = example_with_crossover(250, c=5, coef_main = c(3+5,0,2), sigma=1)
#'
#' ## If true model is "Diathesis Stress WEAK"
#' GxE_test_BIC = GxE_interaction_test(ex_dia$data, ex_dia$G, ex_dia$E,
#' formula_noGxE = y ~ 1, start_genes = ex_dia$coef_G, start_env = ex_dia$coef_E,
#' criterion="BIC")
#' GxE_test_BIC$results
#'
#' ## If true model is "Diathesis Stress STRONG"
#' GxE_test_BIC = GxE_interaction_test(ex_dia_s$data, ex_dia_s$G, ex_dia_s$E,
#' formula_noGxE = y ~ 1, start_genes = ex_dia_s$coef_G, start_env = ex_dia_s$coef_E,
#' criterion="BIC")
#' GxE_test_BIC$results
#'
#' ## If true model is "Differential susceptibility WEAK"
#' GxE_test_BIC = GxE_interaction_test(ex_ds$data, ex_ds$G, ex_ds$E,
#' formula_noGxE = y ~ 1, start_genes = ex_ds$coef_G, start_env = ex_ds$coef_E,
#' criterion="BIC")
#' GxE_test_BIC$results
#'
#' ## If true model is "Differential susceptibility STRONG"
#' GxE_test_BIC = GxE_interaction_test(ex_ds_s$data, ex_ds_s$G, ex_ds_s$E,
#' formula_noGxE = y ~ 1, start_genes = ex_ds_s$coef_G, start_env = ex_ds_s$coef_E,
#' criterion="BIC")
#' GxE_test_BIC$results
#'
#' # Example of plots
#' plot(GxE_test_BIC$fits$diff_suscept_STRONG, xlim=c(0,10), ylim=c(3,13))
#' plot(GxE_test_BIC$fits$diff_suscept_WEAK, xlim=c(0,10), ylim=c(3,13))
#' plot(GxE_test_BIC$fits$diathesis_stress_STRONG, xlim=c(0,10), ylim=c(3,13))
#' plot(GxE_test_BIC$fits$diathesis_stress_WEAK, xlim=c(0,10), ylim=c(3,13))
#' }
#' @import formula.tools stats
#' @references Alexia Jolicoeur-Martineau, Jay Belsky, Eszter Szekely, Keith F. Widaman, Michael Pluess, Celia Greenwood and Ashley Wazana. \emph{Distinguishing differential susceptibility, diathesis-stress and vantage sensitivity: beyond the single gene and environment model} (2017). psyarxiv.com/27uw8. 10.17605/OSF.IO/27UW8.
#' @references Alexia Jolicoeur-Martineau, Ashley Wazana, Eszter Szekely, Meir Steiner, Alison S. Fleming, James L. Kennedy, Michael J. Meaney, Celia M.T. Greenwood and the MAVAN team. \emph{Alternating optimization for GxE modelling with weighted genetic and environmental scores: examples from the MAVAN study} (2017). arXiv:1703.08111.
#' @references Jay Belsky, Michael Pluess and Keith F. Widaman. \emph{Confirmatory and competitive evaluation of alternative gene-environment interaction hypotheses} (2013). Journal of Child Psychology and Psychiatry, 54(10), 1135-1143.
#' @export

GxE_interaction_test = function(data, genes, env, formula_noGxE, crossover=c("min","max"), include_noGxE_models = TRUE, reverse_code=FALSE, rescale = FALSE, boot = NULL, criterion="BIC", start_genes=NULL, start_env=NULL, eps=.001, maxiter=100, family=gaussian, ylim=NULL, cv_iter=5, cv_folds=10, folds=NULL, Huber_p=1.345, id=NULL, classification=FALSE, seed=NULL, test_only=FALSE, lme4=FALSE)
{
formula = stats::as.formula(formula_noGxE)
formula_WEAK = paste0(formula, " + G*E - G")
formula_STRONG = paste0(formula, " + G*E - G - E")
formula_WEAK_nocrossover = paste0(formula, " + G*E")
formula_STRONG_nocrossover = paste0(formula, " + G*E - E")

if (include_noGxE_models){
if (criterion == "cv" || criterion == "cv_AUC" || criterion=="cv_Huber" || criterion=="cv_L1") warning("When using include_noGxE_models=TRUE, the criterion must be one of the following: AIC, AICc, or BIC.")

genes_formula = paste0(" + ", colnames(genes))
env_formula = paste0(" + ", colnames(env))
formula_model_E_only = paste0(formula, env_formula)
formula_model_intercept_only = formula
formula_model_GandE_only = paste0(formula, genes_formula, env_formula)
formula_model_G_only = paste0(formula, genes_formula)

model_E_only = glm(data=cbind(data, genes, env), formula=formula_model_E_only, family=family)
model_intercept_only = glm(data=cbind(data, genes, env), formula=formula_model_intercept_only, family=family)
model_GandE_only = glm(data=cbind(data, genes, env), formula=formula_model_GandE_only, family=family)
model_G_only = glm(data=cbind(data, genes, env), formula=formula_model_G_only, family=family)
}

# 4 Models
# If min or max, then we must find the min or max E and make it the crossover after
# We also use the G and E weights too as starting point for even more stability.
crossover_vantage_sensitivity_WEAK = crossover[1]
crossover_vantage_sensitivity_STRONG = crossover[1]
crossover_diathesis_stress_WEAK = crossover[2]
crossover_diathesis_stress_STRONG = crossover[2]

# Need to take min if coef > 0 and max if coef < 0 to get lower bound
# Need to take max if coef > 0 and min if coef < 0 to get upper bound
get_LU = function(fit){
coef_env = coef(fit$fit_env) env_lower_bound = rep(0, NCOL(env)) env_upper_bound = rep(0, NCOL(env)) for (i in 1:NCOL(env)){ if (coef_env[i] >= 0){ env_lower_bound[i] = min(fit$fit_env$data[,names(env)[i]]) env_upper_bound[i] = max(fit$fit_env$data[,names(env)[i]]) } else{ env_lower_bound[i] = max(fit$fit_env$data[,names(env)[i]]) env_upper_bound[i] = min(fit$fit_env$data[,names(env)[i]]) } } return(list(L=env_lower_bound, U=env_upper_bound)) } # Vantage sensitivity c = min if (crossover[1] == "min"){ vantage_sensitivity_WEAK = LEGIT(data=data, genes=genes, env=env, formula=formula_WEAK, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) if (crossover[1] == "min") crossover_vantage_sensitivity_WEAK = as.numeric(get_LU(vantage_sensitivity_WEAK)$L %*% coef(vantage_sensitivity_WEAK$fit_env)) vantage_sensitivity_WEAK = LEGIT(data=data, genes=genes, env=env, formula=formula_WEAK, start_genes=coef(vantage_sensitivity_WEAK$fit_genes), start_env=coef(vantage_sensitivity_WEAK$fit_env), eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = crossover_vantage_sensitivity_WEAK, crossover_fixed = TRUE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) vantage_sensitivity_STRONG = LEGIT(data=data, genes=genes, env=env, formula=formula_STRONG, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) if (crossover[1] == "min") crossover_vantage_sensitivity_STRONG = as.numeric(get_LU(vantage_sensitivity_STRONG)$L %*% coef(vantage_sensitivity_STRONG$fit_env)) vantage_sensitivity_STRONG = LEGIT(data=data, genes=genes, env=env, formula=formula_STRONG, start_genes=coef(vantage_sensitivity_STRONG$fit_genes), start_env=coef(vantage_sensitivity_STRONG$fit_env), eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = 
crossover_vantage_sensitivity_STRONG, crossover_fixed = TRUE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) } else{ vantage_sensitivity_WEAK = LEGIT(data=data, genes=genes, env=env, formula=formula_WEAK, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = crossover[1], crossover_fixed = TRUE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) vantage_sensitivity_STRONG = LEGIT(data=data, genes=genes, env=env, formula=formula_STRONG, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = crossover[1], crossover_fixed = TRUE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) } # Diathesis-Stress c = max if (crossover[2] == "max"){ diathesis_stress_WEAK = LEGIT(data=data, genes=genes, env=env, formula=formula_WEAK, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) if (crossover[2] == "max") crossover_diathesis_stress_WEAK = as.numeric(get_LU(diathesis_stress_WEAK)$U %*% coef(diathesis_stress_WEAK$fit_env)) diathesis_stress_WEAK = LEGIT(data=data, genes=genes, env=env, formula=formula_WEAK, start_genes=coef(diathesis_stress_WEAK$fit_genes), start_env=coef(diathesis_stress_WEAK$fit_env), eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = crossover_diathesis_stress_WEAK, crossover_fixed = TRUE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) diathesis_stress_STRONG = LEGIT(data=data, genes=genes, env=env, formula=formula_STRONG, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) if (crossover[2] == "max") crossover_diathesis_stress_STRONG = as.numeric(get_LU(diathesis_stress_STRONG)$U %*% coef(diathesis_stress_STRONG$fit_env)) diathesis_stress_STRONG = 
LEGIT(data=data, genes=genes, env=env, formula=formula_STRONG, start_genes=coef(diathesis_stress_STRONG$fit_genes), start_env=coef(diathesis_stress_STRONG$fit_env), eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = crossover_diathesis_stress_STRONG, crossover_fixed = TRUE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) } else{ diathesis_stress_WEAK = LEGIT(data=data, genes=genes, env=env, formula=formula_WEAK, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = crossover[2], crossover_fixed = TRUE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) diathesis_stress_STRONG = LEGIT(data=data, genes=genes, env=env, formula=formula_STRONG, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = crossover[2], crossover_fixed = TRUE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) } # We first run the models as normal LEGIT models without crossover, then we use C=-beta_G/beta_GxE as the crossover starting point. # We also use the G and E weights too as starting point for even more stability. 
diff_suscept_WEAK = LEGIT(data=data, genes=genes, env=env, formula=formula_WEAK_nocrossover, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) if (lme4) crossover_diff_suscept_WEAK = -lme4::fixef(diff_suscept_WEAK$fit_main)[2]/lme4::fixef(diff_suscept_WEAK$fit_main)[4] else crossover_diff_suscept_WEAK = -coef(diff_suscept_WEAK$fit_main)[2]/coef(diff_suscept_WEAK$fit_main)[4] diff_suscept_WEAK = LEGIT(data=data, genes=genes, env=env, formula=formula_WEAK, start_genes=coef(diff_suscept_WEAK$fit_genes), start_env=coef(diff_suscept_WEAK$fit_env), eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = crossover_diff_suscept_WEAK, crossover_fixed = FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) diff_suscept_STRONG = LEGIT(data=data, genes=genes, env=env, formula=formula_STRONG_nocrossover, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) if (lme4) crossover_diff_suscept_STRONG = -lme4::fixef(diff_suscept_STRONG$fit_main)[2]/lme4::fixef(diff_suscept_STRONG$fit_main)[3] else crossover_diff_suscept_STRONG = -coef(diff_suscept_STRONG$fit_main)[2]/coef(diff_suscept_STRONG$fit_main)[3] diff_suscept_STRONG = LEGIT(data=data, genes=genes, env=env, formula=formula_STRONG, start_genes=coef(diff_suscept_STRONG$fit_genes), start_env=coef(diff_suscept_STRONG$fit_env), eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, crossover = crossover_diff_suscept_STRONG, crossover_fixed = FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) # Criterions if (criterion == "cv" || criterion == "cv_AUC" || criterion=="cv_Huber" || criterion=="cv_L1"){ if (crossover[1] == "min"){ vantage_sensitivity_WEAK_cv = LEGIT_cv(data=data, genes=genes, env=env, formula=formula_WEAK, cv_iter=cv_iter, cv_folds=cv_folds, 
folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover_vantage_sensitivity_WEAK, crossover_fixed = TRUE, test_only = test_only, lme4=lme4) vantage_sensitivity_STRONG_cv = LEGIT_cv(data=data, genes=genes, env=env, formula=formula_STRONG, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover_vantage_sensitivity_STRONG, crossover_fixed = TRUE, test_only = test_only, lme4=lme4) } else{ vantage_sensitivity_WEAK_cv = LEGIT_cv(data=data, genes=genes, env=env, formula=formula_WEAK, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover[1], crossover_fixed = TRUE, test_only = test_only, lme4=lme4) vantage_sensitivity_STRONG_cv = LEGIT_cv(data=data, genes=genes, env=env, formula=formula_STRONG, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover[1], crossover_fixed = TRUE, test_only = test_only, lme4=lme4) } if (crossover[2] == "max"){ diathesis_stress_WEAK_cv = LEGIT_cv(data=data, genes=genes, env=env, formula=formula_WEAK, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover_diathesis_stress_WEAK, crossover_fixed = TRUE, test_only = test_only, lme4=lme4) diathesis_stress_STRONG_cv = LEGIT_cv(data=data, genes=genes, 
env=env, formula=formula_STRONG, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover_diathesis_stress_STRONG, crossover_fixed = TRUE, test_only = test_only, lme4=lme4) } else{ diathesis_stress_WEAK_cv = LEGIT_cv(data=data, genes=genes, env=env, formula=formula_WEAK, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover[2], crossover_fixed = TRUE, test_only = test_only, lme4=lme4) diathesis_stress_STRONG_cv = LEGIT_cv(data=data, genes=genes, env=env, formula=formula_STRONG, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover[2], crossover_fixed = TRUE, test_only = test_only, lme4=lme4) } diff_suscept_WEAK_cv = LEGIT_cv(data=data, genes=genes, env=env, formula=formula_WEAK, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover_diff_suscept_WEAK, crossover_fixed = FALSE, test_only = test_only, lme4=lme4) diff_suscept_STRONG_cv = LEGIT_cv(data=data, genes=genes, env=env, formula=formula_STRONG, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, seed=seed, id=id, crossover = crossover_diff_suscept_STRONG, crossover_fixed = FALSE, test_only = test_only, lme4=lme4) if (criterion == "cv") model_criterion = 
c(mean(vantage_sensitivity_WEAK_cv$R2_cv), mean(vantage_sensitivity_STRONG_cv$R2_cv),mean(diathesis_stress_WEAK_cv$R2_cv), mean(diathesis_stress_STRONG_cv$R2_cv), mean(diff_suscept_WEAK_cv$R2_cv), mean(diff_suscept_STRONG_cv$R2_cv)) else if (criterion == "cv_Huber") model_criterion = c(mean(vantage_sensitivity_WEAK_cv$Huber_cv), mean(vantage_sensitivity_STRONG_cv$Huber_cv),mean(diathesis_stress_WEAK_cv$Huber_cv), mean(diathesis_stress_STRONG_cv$Huber_cv), mean(diff_suscept_WEAK_cv$Huber_cv), mean(diff_suscept_STRONG_cv$Huber_cv)) else if (criterion == "cv_L1") model_criterion = c(mean(vantage_sensitivity_WEAK_cv$L1_cv), mean(vantage_sensitivity_STRONG_cv$L1_cv),mean(diathesis_stress_WEAK_cv$L1_cv), mean(diathesis_stress_STRONG_cv$L1_cv), mean(diff_suscept_WEAK_cv$L1_cv), mean(diff_suscept_STRONG_cv$L1_cv)) else if (criterion == "cv_AUC") model_criterion = c(mean(vantage_sensitivity_WEAK_cv$AUC), mean(vantage_sensitivity_STRONG_cv$AUC),mean(diathesis_stress_WEAK_cv$AUC), mean(diathesis_stress_STRONG_cv$AUC), mean(diff_suscept_WEAK_cv$AUC), mean(diff_suscept_STRONG_cv$AUC)) # List in order from best to worse ordering = order(model_criterion, decreasing = TRUE) } else{ if (criterion == "AIC") model_criterion = c(vantage_sensitivity_WEAK$true_model_parameters$AIC, vantage_sensitivity_STRONG$true_model_parameters$AIC,diathesis_stress_WEAK$true_model_parameters$AIC, diathesis_stress_STRONG$true_model_parameters$AIC, diff_suscept_WEAK$true_model_parameters$AIC, diff_suscept_STRONG$true_model_parameters$AIC) else if (criterion == "AICc") model_criterion = c(vantage_sensitivity_WEAK$true_model_parameters$AICc, vantage_sensitivity_STRONG$true_model_parameters$AICc,diathesis_stress_WEAK$true_model_parameters$AICc, diathesis_stress_STRONG$true_model_parameters$AICc, diff_suscept_WEAK$true_model_parameters$AICc, diff_suscept_STRONG$true_model_parameters$AICc) else if (criterion == "BIC") model_criterion = c(vantage_sensitivity_WEAK$true_model_parameters$BIC, 
vantage_sensitivity_STRONG$true_model_parameters$BIC,diathesis_stress_WEAK$true_model_parameters$BIC, diathesis_stress_STRONG$true_model_parameters$BIC, diff_suscept_WEAK$true_model_parameters$BIC, diff_suscept_STRONG$true_model_parameters$BIC) if (include_noGxE_models){ logLik_df1 = attr(logLik(model_E_only), "df") logLik_df2 = attr(logLik(model_intercept_only), "df") logLik_df3 = attr(logLik(model_GandE_only), "df") logLik_df4 = attr(logLik(model_G_only), "df") if (criterion == "AIC") model_criterion = c(model_criterion, 2*logLik_df1 - 2*logLik(model_E_only)[1], 2*logLik_df2 - 2*logLik(model_intercept_only)[1], 2*logLik_df3 - 2*logLik(model_GandE_only)[1], 2*logLik_df4 - 2*logLik(model_G_only)[1]) else if (criterion == "AICc") model_criterion = c(model_criterion, (2*logLik_df1 - 2*logLik(model_E_only)[1]) + ((2*logLik_df1*(logLik_df1+1))/(stats::nobs(model_E_only)-logLik_df1-1)), (2*logLik_df2 - 2*logLik(model_intercept_only)[1]) + ((2*logLik_df2*(logLik_df2+1))/(stats::nobs(model_intercept_only)-logLik_df2-1)), (2*logLik_df3 - 2*logLik(model_GandE_only)[1]) + ((2*logLik_df3*(logLik_df3+1))/(stats::nobs(model_GandE_only)-logLik_df3-1)),(2*logLik_df4 - 2*logLik(model_G_only)[1]) + ((2*logLik_df4*(logLik_df4+1))/(stats::nobs(model_G_only)-logLik_df4-1))) else if (criterion == "BIC") model_criterion = c(model_criterion, log(stats::nobs(model_E_only))*logLik_df1 - 2*logLik(model_E_only)[1], log(stats::nobs(model_intercept_only))*logLik_df2 - 2*logLik(model_intercept_only)[1], log(stats::nobs(model_GandE_only))*logLik_df3 - 2*logLik(model_GandE_only)[1], log(stats::nobs(model_G_only))*logLik_df4 - 2*logLik(model_G_only)[1]) } # List in order from best to worse ordering = order(model_criterion) } model_criterion = round(model_criterion[ordering],2) if (include_noGxE_models) crossover_models = round(c(vantage_sensitivity_WEAK$crossover, vantage_sensitivity_STRONG$crossover, diathesis_stress_WEAK$crossover, diathesis_stress_STRONG$crossover, diff_suscept_WEAK$crossover, 
diff_suscept_STRONG$crossover, NA, NA, NA, NA)[ordering],2) else crossover_models = round(c(vantage_sensitivity_WEAK$crossover, vantage_sensitivity_STRONG$crossover, diathesis_stress_WEAK$crossover, diathesis_stress_STRONG$crossover, diff_suscept_WEAK$crossover, diff_suscept_STRONG$crossover)[ordering],2) # 95% interval of crossover for differential susceptibility models if (!is.null(boot)){ # We must do bootstrap LEGIT_boot = function(d, i){ data_ = d[i,,drop=FALSE] genes_ = genes[i,,drop=FALSE] env_ = env[i,,drop=FALSE] diff_suscept_WEAK = LEGIT(data=data_, genes=genes_, env=env_, formula=formula_WEAK_nocrossover, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) diff_suscept_STRONG = LEGIT(data=data_, genes=genes_, env=env_, formula=formula_STRONG_nocrossover, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=FALSE, reverse_code=reverse_code, rescale=rescale, lme4=lme4) if (lme4) out = (c(-lme4::fixef(diff_suscept_WEAK$fit_main)[2]/lme4::fixef(diff_suscept_WEAK$fit_main)[4],-lme4::fixef(diff_suscept_STRONG$fit_main)[2]/lme4::fixef(diff_suscept_STRONG$fit_main)[3])) else out = (c(-coef(diff_suscept_WEAK$fit_main)[2]/coef(diff_suscept_WEAK$fit_main)[4],-coef(diff_suscept_STRONG$fit_main)[2]/coef(diff_suscept_STRONG$fit_main)[3])) return(out) } boot_results = boot::boot(data, LEGIT_boot, boot) crossover_interval_WEAK = round(boot::boot.ci(boot_results, index=1, type="bca")$bca[4:5],2)
crossover_interval_STRONG = round(boot::boot.ci(boot_results, index=2, type="bca")$bca[4:5],2) } else{ # Non-bootstrapped crossover_interval_WEAK = round(suppressMessages(stats::confint(diff_suscept_WEAK$fit_env,"crossover")),2)
crossover_interval_STRONG = round(suppressMessages(stats::confint(diff_suscept_STRONG$fit_env,"crossover")),2) } data_dw = diathesis_stress_WEAK$fit_env$data$E
data_ds = diathesis_stress_STRONG$fit_env$data$E data_vw = vantage_sensitivity_WEAK$fit_env$data$E
data_vs = vantage_sensitivity_STRONG$fit_env$data$E data_dfw = diff_suscept_WEAK$fit_env$data$E
data_dfs = diff_suscept_STRONG$fit_env$data$E data_E = diff_suscept_WEAK$fit_env$data$E

inside_WEAK = (crossover_interval_WEAK[1] > min(data_E) && crossover_interval_WEAK[2] < max(data_E))
inside_STRONG = (crossover_interval_STRONG[1] > min(data_E) && crossover_interval_STRONG[2] < max(data_E))
crossover_interval_WEAK = paste0("( ",crossover_interval_WEAK[1]," / ",crossover_interval_WEAK[2]," )")
crossover_interval_STRONG = paste0("( ",crossover_interval_STRONG[1]," / ",crossover_interval_STRONG[2]," )")

# Proportion of observations below crossover point
if (include_noGxE_models) prop_below = c(sum(data_vw <= crossover_vantage_sensitivity_WEAK)/NROW(data_vw), sum(data_vs <= crossover_vantage_sensitivity_STRONG)/NROW(data_vs), sum(data_dw <= crossover_diathesis_stress_WEAK)/NROW(data_dw), sum(data_ds <= crossover_diathesis_stress_STRONG)/NROW(data_ds), sum(data_dfw <= crossover_diff_suscept_WEAK)/NROW(data_dfw), sum(data_dfs <= crossover_diff_suscept_STRONG)/NROW(data_dfs), NA, NA, NA, NA)
else prop_below = c(sum(data_vw <= crossover_vantage_sensitivity_WEAK)/NROW(data_vw), sum(data_vs <= crossover_vantage_sensitivity_STRONG)/NROW(data_vs), sum(data_dw <= crossover_diathesis_stress_WEAK)/NROW(data_dw), sum(data_ds <= crossover_diathesis_stress_STRONG)/NROW(data_ds), sum(data_dfw <= crossover_diff_suscept_WEAK)/NROW(data_dfw), sum(data_dfs <= crossover_diff_suscept_STRONG)/NROW(data_dfs))

# Aggregating results
if (include_noGxE_models){
fits = list(vantage_sensitivity_WEAK = vantage_sensitivity_WEAK, vantage_sensitivity_STRONG = vantage_sensitivity_STRONG, diathesis_stress_WEAK = diathesis_stress_WEAK, diathesis_stress_STRONG = diathesis_stress_STRONG, diff_suscept_WEAK = diff_suscept_WEAK, diff_suscept_STRONG = diff_suscept_STRONG, model_E_only = model_E_only , model_intercept_only = model_intercept_only, model_GandE_only = model_GandE_only, model_G_only = model_G_only)[ordering]
results = cbind(model_criterion, crossover_models,cbind("","","","",crossover_interval_WEAK,crossover_interval_STRONG,"","","","")[ordering],cbind("","","","",c("No","Yes")[inside_WEAK+1],c("No","Yes")[inside_STRONG+1],"","","","")[ordering], prop_below[ordering])
rownames(results) = c("Vantage sensitivity WEAK","Vantage sensitivity STRONG","Diathesis-stress WEAK","Diathesis-stress STRONG","Differential susceptibility WEAK","Differential susceptibility STRONG", "E only", "Intercept only", "G + E only", "G only")[ordering]
}
else{
fits = list(vantage_sensitivity_WEAK = vantage_sensitivity_WEAK, vantage_sensitivity_STRONG = vantage_sensitivity_STRONG, diathesis_stress_WEAK = diathesis_stress_WEAK, diathesis_stress_STRONG = diathesis_stress_STRONG, diff_suscept_WEAK = diff_suscept_WEAK, diff_suscept_STRONG = diff_suscept_STRONG)[ordering]
results = cbind(model_criterion, crossover_models,cbind("","","","",crossover_interval_WEAK,crossover_interval_STRONG)[ordering],cbind("","","","",c("No","Yes")[inside_WEAK+1],c("No","Yes")[inside_STRONG+1])[ordering], prop_below[ordering])
rownames(results) = c("Vantage sensitivity WEAK","Vantage sensitivity STRONG","Diathesis-stress WEAK","Diathesis-stress STRONG","Differential susceptibility WEAK","Differential susceptibility STRONG")[ordering]
}
colnames(results)[1] = criterion
colnames(results)[2] = "crossover"
colnames(results)[3] = "crossover 95%"
colnames(results)[4] = "Within observable range?"
colnames(results)[5] = "% of observations below crossover"
return(list(fits = fits, results = results, E_range=c(min(data_E),max(data_E))))
}

#' @title Regions of significance using Johnson-Neyman technique
#' @description Constructs a LEGIT model and returns the regions of significance (RoS) with the predicted type of interaction (diathesis-stress, vantage-sensitivity, or differential susceptibility). RoS is not recommended due to poor accuracy with small samples and small effect sizes, GxE_interaction_test has much better accuracy overall. Only implemented for family=gaussian.
#' @param data data.frame of the dataset to be used.
#' @param genes data.frame of the variables inside the genetic score \emph{G} (can be any sort of variable, doesn't even have to be genetic).
#' @param env data.frame of the variables inside the environmental score \emph{E} (can be any sort of variable, doesn't even have to be environmental).
#' @param formula_noGxE formula WITHOUT \emph{G} or \emph{E} (y ~ covariates). \emph{G} and \emph{E} will automatically be added.
#' @param t_alpha Alpha level of the student-t distribution for the regions of significance (Default = .05).
#' @param start_genes Optional starting points for genetic score (must be the same length as the number of columns of \code{genes}).
#' @param start_env Optional starting points for environmental score (must be the same length as the number of columns of \code{env}).
#' @param eps Threshold for convergence (.01 for quick batch simulations, .0001 for accurate results).
#' @param maxiter Maximum number of iterations.
#' @param ylim Optional vector containing the known min and max of the outcome variable. Even if your outcome is known to be in [a,b], if you assume a Gaussian distribution, predict() could return values outside this range. This parameter ensures that this never happens. This is not necessary with a distribution that already assumes the proper range (ex: [0,1] with binomial distribution).
#' @param reverse_code If TRUE, after fitting the model, the genes with negative weights are reverse coded (ex: \eqn{g_rev} = 1 - \eqn{g}). It assumes that the original coding is in [0,1]. The purpose of this option is to prevent genes with negative weights which cause interpretation problems (ex: depression normally decreases attention but with a negative genetic score, it increases attention). Warning, using this option with GxG interactions could cause nonsensical results since GxG could be inverted. Also note that this may fail with certain models (Default=FALSE).
#' @param rescale If TRUE, the environmental variables are automatically rescaled to the range [-1,1]. This improves interpretability (Default=FALSE).
#' @return Returns a list containing the RoS and the predicted type of interaction.
#' @examples
#'	train = example_2way(500, 1, seed=777)
#'	ros = GxE_interaction_RoS(train$data, train$G, train$E, y ~ 1)
#'	ros
#' @import formula.tools stats
#' @references Alexia Jolicoeur-Martineau, Jay Belsky, Eszter Szekely, Keith F. Widaman, Michael Pluess, Celia Greenwood and Ashley Wazana. \emph{Distinguishing differential susceptibility, diathesis-stress and vantage sensitivity: beyond the single gene and environment model} (2017). psyarxiv.com/27uw8. 10.17605/OSF.IO/27UW8.
#' @references Daniel J. Bauer & Patrick J. Curran. \emph{Probing Interactions in Fixed and Multilevel Regression: Inferential and Graphical Techniques} (2005). Multivariate Behavioral Research, 40:3, 373-400, DOI: 10.1207/s15327906mbr4003_5.
#' @export

GxE_interaction_RoS = function(data, genes, env, formula_noGxE, t_alpha = .05, start_genes=NULL, start_env=NULL, eps=.001, maxiter=100, ylim=NULL, reverse_code=FALSE, rescale=FALSE){
# Append the G*E interaction terms to the covariate-only formula.
# NOTE(review): paste0() coerces the formula object through as.character(),
# which yields a character vector such as c("~", "y", "1") rather than a single
# string -- confirm that LEGIT()/lm() downstream reconstruct the intended
# "y ~ covariates + G*E" from this.
formula = stats::as.formula(formula_noGxE)
formula = paste0(formula, " + G*E")
# Fit the LEGIT model (gaussian only) to estimate the weighted G and E scores.
fit_LEGIT = LEGIT(data=data, genes=genes, env=env, formula=formula, start_genes=start_genes, start_env=start_env, eps=eps, maxiter=maxiter, family=gaussian, ylim=ylim, print=FALSE, print_steps=FALSE, crossover = NULL, crossover_fixed = FALSE, reverse_code=reverse_code, rescale=rescale)
# Refit a plain lm on the main-model data (which contains the estimated G and
# E scores) so that coefficient variances/covariances are available via vcov().
fit = lm(formula, fit_LEGIT$fit_main$data)
# b1 = main effect of G, b3 = G:E interaction coefficient.
b1 = coef(fit)["G"]
b3 = coef(fit)["G:E"]
b1_var = vcov(fit)["G","G"]
b3_var = vcov(fit)["G:E","G:E"]
b1_b3_cov = vcov(fit)["G","G:E"]
# Two-sided critical t value at level t_alpha with the lm residual df.
t_crit = qt(t_alpha/2, fit$df.residual, lower.tail = FALSE)

# Johnson-Neyman technique: the boundaries of the region of significance are
# the roots of the quadratic a*E^2 + b*E + c = 0 (Bauer & Curran, 2005).
# Note: local "c" shadows base::c as a value; calls like c(NA,NA) below still
# resolve to the function because R searches for a function in call position.
a = (t_crit^2)*b3_var - (b3^2)
b = 2*((t_crit^2)*b1_b3_cov - b1*b3)
c = (t_crit^2)*b1_var - (b1^2)
# Negative discriminant: no real roots, so the region is undetermined.
if ((b^2) - 4*a*c < 0) return(list(RoS=c(NA,NA), int_type="Undetermined"))
RoS_results = c((-b + sqrt((b^2) - 4*a*c))/(2*a), (-b - sqrt((b^2) - 4*a*c))/(2*a))
RoS_results = sort(RoS_results)
names(RoS_results)=NULL
# Classify the interaction type from whether the region of significance lies
# within the observed range of the environmental score E:
# both bounds inside -> differential susceptibility; only the upper bound
# inside -> vantage sensitivity; only the lower bound inside -> diathesis-stress.
Lower_bounded = TRUE
Upper_bounded = TRUE
if (RoS_results[1] < min(fit$model$E)) Lower_bounded = FALSE
if (RoS_results[2] > max(fit$model$E)) Upper_bounded = FALSE
if (Lower_bounded && Upper_bounded) int_type = "Differential susceptibility"
else if (!Lower_bounded && Upper_bounded) int_type = "Vantage sensitivity"
else if (Lower_bounded && !Upper_bounded) int_type = "Diathesis-stress"
else int_type = "Undetermined"
return(list(RoS=RoS_results, int_type=int_type))
}

#' @title Plot
#' @description Plot of LEGIT models. By default, variables that are not in \emph{G} or \emph{E} are fixed to the mean.
#' @param x An object of class "LEGIT", usually, a result of a call to LEGIT.
#' @param cov_values Vector of the values, for each covariate, that will be used in the plotting, if there are any covariates. It must contain the names of the variables. Covariates are the variables that are not \emph{G} nor \emph{E} but still are adjusted for in the model. By default, covariates are fixed to the mean.
#' @param gene_quant Vector of the genes quantiles used to make the plot. We use quantiles instead of fixed values because genetic scores can vary widely depending on the weights, thus looking at quantiles make this simpler. (Default = c(.025,.50,.975))
#' @param env_quant Vector of the environments quantiles used to make the plot. We use quantiles instead of fixed values because environmental scores can vary widely depending on the weights, thus looking at quantiles make this simpler. (Default = c(.025,.50,.975))
#' @param outcome_quant Vector of the outcome quantiles used to make the plot. We use quantiles instead of fixed values because environmental scores can vary widely depending on the weights, thus looking at quantiles make this simpler. (Default = c(.025,.50,.975))
#' @param cols Colors for the slopes with different genetic score. Must be a vector same length as "gene_range". (Default = c("#3288BD", "#CAB176", "#D53E4F"))
#' @param ylab Y-axis label (Default = "Outcome")
#' @param xlab X-axis label (Default = "Environment")
#' @param legtitle Title of the Legend for the genes slopes label (Default = "Genetic score")
#' @param leglab Optional vector of labels of the Legend for the genes slopes label
#' @param cex.axis relative scale of axis (Default = 1.9)
#' @param cex.lab relative scale of labels (Default = 2)
#' @param cex.main relative scale overall (Default = 2.2)
#' @param cex.leg relative scale of legend (Default = 2.2)
#' @param xlim X-axis vector of size two with min and max (Default = NULL which leads to min="2.5 percentile" and max="97.5 percentile").
#' @param ylim Y-axis vector of size two with min and max (Default = NULL which leads to min="2.5 percentile" and max="97.5 percentile").
#' @param x_at specific ticks for the X-axis, first and last will be min and max respectively (Default = NULL which leads to 2.5, 50 and 97.5 percentiles).
#' @param y_at specific ticks for the Y-axis, first and last will be min and max respectively (Default = NULL which leads to 2.5, 50 and 97.5 percentiles).
#' @param legend The location of the legend may be specified by setting \code{legend} to a single keyword from the list "bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright", "right" and "center" (Default = "topleft").
#' @return Produces a plot of the fitted model (outcome against the environmental score, one line per genetic-score quantile); no meaningful return value.
#' @param ... Further arguments passed to or from other methods.
#' @examples
#'	train = example_2way(500, 1, seed=777)
#'	fit = LEGIT(train$data, train$G, train$E, y ~ G*E, train$coef_G, train$coef_E)
#'	plot(fit)
#' @import formula.tools graphics
#' @references Alexia Jolicoeur-Martineau, Ashley Wazana, Eszter Szekely, Meir Steiner, Alison S. Fleming, James L. Kennedy, Michael J. Meaney, Celia M.T. Greenwood and the MAVAN team. \emph{Alternating optimization for GxE modelling with weighted genetic and environmental scores: examples from the MAVAN study} (2017). arXiv:1703.08111.
#' @exportS3Method plot LEGIT

plot.LEGIT = function(x, cov_values = NULL, gene_quant = c(.025,.50,.975), env_quant = c(.025,.50,.975), outcome_quant = c(.025,.50,.975), cols = c("#3288BD", "#CAB176", "#D53E4F"), ylab="Outcome", xlab="Environment", legtitle="Genetic score", leglab=NULL, xlim= NULL, ylim= NULL, x_at = NULL, y_at = NULL, cex.axis = 1.9, cex.lab=2, cex.main=2.2, cex.leg=2.2, legend="topleft", ...){
# Better names (need to use x for S3 class consistency)
object = x
formula = object$formula
lme4 = object$true_model_parameters$lme4

# formula stuff
formula = stats::as.formula(formula)
# Formula with no random effects: strip "+ (...)" random-effect terms via
# string surgery on the deparsed formula so only fixed effects remain.
formula_norand = gsub("\\s", "", deparse(formula))
formula_norand = gsub("\\+\\(.*)","",formula_norand)
formula_norand = gsub("\\(.*)","",formula_norand)
formula_norand = as.formula(formula_norand)
# Formula with rand effect put as fixed effects, so model.frame works:
# drop parentheses and turn the lme4 "|" / "||" separators into "+".
formula_wrand = gsub("(","",formula, fixed=TRUE)
formula_wrand = gsub(")","",formula_wrand, fixed=TRUE)
formula_wrand = gsub("||","+",formula_wrand, fixed=TRUE)
formula_wrand = gsub("|","+",formula_wrand, fixed=TRUE)
formula_wrand = as.formula(formula_wrand)

# Quantile range: sample quantiles of the estimated G and E scores and of the
# observed outcome, used as default axis limits/tick positions.
gene_range = as.numeric(quantile(object$fit_genes$data$G, gene_quant))
env_range = as.numeric(quantile(object$fit_env$data$E, env_quant))
formula_outcome = get.vars(formula)[1]
outcome_range = as.numeric(quantile(object$fit_env$data[formula_outcome][,], outcome_quant))
# Fix this attribute to prevent problems if trying to predict fit_main directly later on
if (!lme4) attr(object$fit_main$terms,"dataClasses")[attr(object$fit_main$terms,"dataClasses")=="nmatrix.1"] = "numeric"

# Defaults: derive missing axis limits from ticks (or quantiles) and vice versa.
if (is.null(ylim)){
if (is.null(y_at)) ylim = c(outcome_range[1], outcome_range[length(outcome_range)])
else ylim = c(y_at[1], y_at[length(y_at)])
}
else if (is.null(y_at)) y_at = seq(ylim[1], ylim[2], length.out=3)
if (is.null(xlim)){
if (is.null(x_at)) xlim = c(env_range[1], env_range[length(env_range)])
else xlim = c(x_at[1], x_at[length(x_at)])
}
else if (is.null(x_at)) x_at = seq(xlim[1], xlim[2], length.out=3)

# Plot: empty canvas first, then manual axes (quantile labels when no
# explicit ticks were requested).
# NOTE(review): par() is modified but never restored (no on.exit(par(op)));
# the caller's graphics state is left changed -- confirm this is intended.
op <- par(mfrow=c(1,1), mar=c(5.1, 5.1, 5.1, 4.1))
graphics::plot(x=c(), y=c() ,ylab = ylab, xlab = xlab, ylim = ylim, xlim=xlim, cex.lab=cex.lab, cex.main=cex.main, pch=20, bty="n", xaxt="n", yaxt="n")
if (is.null(y_at)) graphics::axis(2, at=outcome_range, labels=paste0(outcome_quant*100,"%"), cex.axis = cex.axis)
else graphics::axis(2, at=y_at, cex.axis = cex.axis)
if (is.null(x_at)) graphics::axis(1, at=env_range, labels=paste0(env_quant*100,"%"), cex.axis = cex.axis)
else graphics::axis(1, at=x_at, cex.axis = cex.axis)
# 101-point grid over the environmental axis for the prediction lines;
# center on the crossover point when the model was fitted with one.
E = seq(xlim[1],xlim[2], length.out=101)
if (!is.null(object$crossover)) E_ = E - object$crossover
else E_ = E

# covariates: every model variable that is neither the outcome nor G nor E.
covariates = get.vars(formula_wrand)[-1]
covariates = covariates[covariates != "G" & covariates != "E"]
if (!is.null(cov_values) && length(cov_values)!=length(covariates)) stop("cov_values doesn't have the correct number of covariates")

# Predictions and plot: one line (and confidence band when not lme4) per
# genetic-score quantile.
for (j in 1:length(gene_range)){
# making data for predictions
G = gene_range[j]
newdata_base = data.frame(E=E_, G=G)
newdata = newdata_base
# NOTE(review): 1:length(covariates) iterates c(1, 0) when there are no
# covariates; the identical(covariates, character(0)) guard inside makes
# those iterations no-ops, but seq_along would be the safer idiom.
for (i in 1:length(covariates)){
if (identical(covariates, character(0))) newdata = newdata
else if (is.null(cov_values)){
# Fix unspecified covariates at their first factor level / at their mean.
# NOTE(review): data[covariates[i]] is a one-column data.frame, so
# is.factor() on it is always FALSE -- verify whether [[ ]] was intended.
if (is.factor(object$fit_genes$data[covariates[i]])) newdata = cbind(newdata, rep(levels(object$fit_genes$data[covariates[i]])[1],101))
else newdata = cbind(newdata, rep(apply(object$fit_genes$data[covariates[i]], 2, mean),101))
colnames(newdata)[NCOL(newdata)] = covariates[i]
}
else{
newdata = cbind(newdata, rep(cov_values[i], 101))
colnames(newdata)[NCOL(newdata)] = names(cov_values)[i]
}
}
# Prediction lines: predict on the link scale and back-transform with the
# family's inverse link; for non-lme4 fits also draw a +/- 2 SE band.
ilink <- family(object$fit_main)$linkinv
if (lme4){
preds <- predict(object$fit_main, newdata = newdata, type = "link", re.form=NA)
graphics::lines(E,ilink(preds), col=cols[j], lwd=2)
} else{
preds <- predict(object$fit_main, newdata = newdata, se.fit = TRUE, type = "link")
graphics::polygon(c(E,rev(E)),c(ilink(preds$fit-2*preds$se.fit),rev(ilink(preds$fit+2*preds$se.fit))),col=grDevices::adjustcolor(cols[j], alpha.f = 0.2),border = NA)
graphics::lines(E,ilink(preds$fit), col=cols[j], lwd=2)
}
}
# Legend: default labels are the genetic-score quantiles as percentages.
if (is.null(leglab)) leglab = paste0(gene_quant*100,"%")
legend(legend, legend=leglab,col = cols, lty=1, lwd=3, xpd = TRUE, cex = cex.leg, title=legtitle)
}

#' @title Independent Multiple Latent Environmental & Genetic InTeraction (IMLEGIT) model
#' @description Constructs a generalized linear model (glm) with latent variables using alternating optimization. This is an extension of the LEGIT model to accommodate more than 2 latent variables.
#' @param data data.frame of the dataset to be used.
#' @param latent_var list of data.frame. The elements of the list are the datasets used to construct each latent variable. For interpretability and proper convergence, not using the same variable in more than one latent variable is highly recommended. It is recommended to set names to the list elements to prevent confusion because otherwise, the latent variables will be named L1, L2, ... (See examples below for more details)
#' @param formula Model formula. The names of \code{latent_var} can be used in the formula to represent the latent variables. If names(\code{latent_var}) is NULL, then L1, L2, ... can be used in the formula to represent the latent variables. Do not manually code interactions, write them in the formula instead (ex: G*E1*E2 or G:E1:E2).
#' @param start_latent_var Optional list of starting points for each latent variable (The list must have the same length as the number of latent variables and each element of the list must have the same length as the number of variables of the corresponding latent variable).
#' @param eps Threshold for convergence (.01 for quick batch simulations, .0001 for accurate results).
#' @param maxiter Maximum number of iterations.
#' @param family Outcome distribution and link function (Default = gaussian).
#' @param ylim Optional vector containing the known min and max of the outcome variable.
#' Even if your outcome is known to be in [a,b], if you assume a Gaussian distribution, predict() could return values outside this range. This parameter ensures that this never happens. This is not necessary with a distribution that already assumes the proper range (ex: [0,1] with binomial distribution).
#' @param print If FALSE, nothing except warnings will be printed. (Default = TRUE).
#' @return Returns an object of the class "IMLEGIT" which is list containing, in the following order: a glm fit of the main model, a list of the glm fits of the latent variables and a list of the true model parameters (AIC, BIC, rank, df.residual, null.deviance) for which the individual model parts (main, genetic, environmental) don't estimate properly.
#' @examples
#'	train = example_2way(500, 1, seed=777)
#'	fit_best = IMLEGIT(train$data, list(G=train$G, E=train$E), y ~ G*E,
#'	list(train$coef_G, train$coef_E))
#'	fit_default = IMLEGIT(train$data, list(G=train$G, E=train$E), y ~ G*E)
#'	summary(fit_default)
#'	summary(fit_best)
#'	train = example_3way_3latent(500, 1, seed=777)
#'	fit_best = IMLEGIT(train$data, train$latent_var, y ~ G*E*Z,
#'	list(train$coef_G, train$coef_E, train$coef_Z))
#'	fit_default = IMLEGIT(train$data, train$latent_var, y ~ G*E*Z)
#'	summary(fit_default)
#'	summary(fit_best)
#' @import formula.tools stats
#' @references Alexia Jolicoeur-Martineau, Ashley Wazana, Eszter Szekely, Meir Steiner, Alison S. Fleming, James L. Kennedy, Michael J. Meaney, Celia M.T. Greenwood and the MAVAN team. \emph{Alternating optimization for GxE modelling with weighted genetic and environmental scores: examples from the MAVAN study} (2017). arXiv:1703.08111.
#' @export

IMLEGIT = function(data, latent_var, formula, start_latent_var=NULL, eps=.001, maxiter=100, family=gaussian, ylim=NULL, print=TRUE)
{
if (!is.null(ylim)){
if (!is.numeric(ylim) || length(ylim) !=2) stop("ylim must either be NULL or a numeric vector of size two")
}
# Setting up latent_var and checks
if (class(latent_var)!="list") stop("latent_var must be a list of datasets")
k = length(latent_var)
if (k==0) stop("latent_var cannot be an empty list")
if (is.null(names(latent_var))){
if (print) cat("You have not specified names for the latent variables, assigning names to latent_var is highly recommended to prevent confusion. For now, they will be named L1, L2, ...\n")
names(latent_var) = paste0("L",1:k)
}
for (i in 1:k){
latent_var[[i]] = as.matrix(data.frame(latent_var[[i]],fix.empty.names=FALSE))
if (sum(colnames(latent_var[[i]])=="") > 0){
if (print) cat(paste0("You have not specified column names for certain elements in ",names(latent_var)[i], ", elements of this latent variable will be named ",names(latent_var)[i],1,", ",names(latent_var)[i],2," ...\n"))
colnames(latent_var[[i]]) = paste0(names(latent_var)[i],1:NCOL(latent_var[[i]]))
}
}

# More checks
if (maxiter <= 0) warning("maxiter must be > 0")
if (k > 1) for (i in 1:(k-1)) if (NROW(latent_var[[i]]) != NROW(latent_var[[i+1]])) stop("Some datasets in latent_var don't have the same number of observations")
if(!is.null(start_latent_var)){
if (class(start_latent_var)!="list") stop("start_latent_var must be a lit of vectors (or NULL)")
if (k!=length(start_latent_var)) stop("start_latent_var must have the same size as latent_var")
for (i in 1:k){
if (!is.null(latent_var[[i]])){
if (NCOL(latent_var[[i]])!=length(start_latent_var[[i]])) stop("All elements of start_latent_var must either be NULL or have the same length as the number of the elements in its associated latent variable")
}
}
}
if (class(data) != "data.frame" && class(data) != "matrix") stop("data must be a data.frame")

# getting right formats
# Retaining only the needed variables from the dataset (need to set elements in latent_var for this to work, they will be replaced with their proper values later)
data=data.frame(data)
for (i in 1:k) data[,names(latent_var)[i]] = 0
data = stats::model.frame(formula, data=data, na.action=na.pass)
formula = stats::as.formula(formula)

if (sum(apply(data,2,is.numeric)) != NCOL(data)) stop("All variables used must be numeric, factors are not allowed. Please dummy code all categorical variables inside your datasets (data, latent_var[[1]], latent_var[[2]], ...)")
for (i in 1:k) if (sum(apply(latent_var[[i]],2,is.numeric)) != NCOL(latent_var[[i]])) stop("All variables used must be numeric, factors are not allowed. Please dummy code all categorical variables inside your datasets (data, latent_var[[1]], latent_var[[2]], ...)")

# remove missing data
comp = stats::complete.cases(data,latent_var[[1]])
if (k > 1) for (i in 2:k) comp = comp & stats::complete.cases(latent_var[[i]])
data = data[comp,, drop=FALSE]
for (i in 1:k) latent_var[[i]] = latent_var[[i]][comp,, drop=FALSE]
if (dim(data)[1] <= 0) stop("no valid observation without missing values")

#Adding empty variables in main dataset for latent_var
for (i in 1:k){
data[,colnames(latent_var[[i]])]=0
data[,paste0("R0_",i)]=0
}

# Setting up initial weighted latent_var
weights_latent_var_old = vector("list", k)
weights_latent_var = vector("list", k)
if (is.null(start_latent_var)){
for (i in 1:k) weights_latent_var[[i]] = rep(1/dim(latent_var[[i]])[2],dim(latent_var[[i]])[2])
}
else{
for (i in 1:k){
if (sum(abs(start_latent_var[[i]]))==0) weights_latent_var[[i]] = rep(1/dim(latent_var[[i]])[2],dim(latent_var[[i]])[2])
else weights_latent_var[[i]] = start_latent_var[[i]]/sum(abs(start_latent_var[[i]]))
}
}
for (i in 1:k) data[,names(latent_var)[i]] = latent_var[[i]]%*%weights_latent_var[[i]]

# Lists needed for later
index_with_latent_var = vector("list", k)
formula_withoutlatent_var  = vector("list", k)
formula_withlatent_var  = vector("list", k)
formula_step  = vector("list", k)
fit_ = vector("list", k)
names(fit_) = names(latent_var)

# Deconstructing formula into parts (With latent_var and without latent_var)
formula_full = stats::terms(formula,simplify=TRUE)
formula_outcome = get.vars(formula)[1]
formula_elem_ = attributes(formula_full)$term.labels # Adding white spaces before and after to recognize a "E" as opposed to another string like "Elephant" formula_elem = paste("", formula_elem_,"") for (i in 1:k) index_with_latent_var[[i]] = grepl(paste0(" ",names(latent_var)[i]," "),formula_elem, fixed=TRUE) | grepl(paste0(" ",names(latent_var)[i],":"),formula_elem, fixed=TRUE) | grepl(paste0(":",names(latent_var)[i],":"),formula_elem, fixed=TRUE) | grepl(paste0(":",names(latent_var)[i]," "),formula_elem, fixed=TRUE) data_expanded = stats::model.matrix(formula, data=data) if (colnames(data_expanded)[1] == "(Intercept)"){ formula_elem = c("1",formula_elem) for (i in 1:k) index_with_latent_var[[i]] = c(FALSE,index_with_latent_var[[i]]) } for (i in 1:k){ ## Formulas for reparametrizations in each steps formula_elem_withoutlatent_var = formula_elem[!index_with_latent_var[[i]]] formula_elem_withoutlatent_var[-length(formula_elem_withoutlatent_var)] = paste0(formula_elem_withoutlatent_var[-length(formula_elem_withoutlatent_var)], " + ") formula_withoutlatent_var[[i]] = paste0(formula_outcome, " ~ ", paste0(formula_elem_withoutlatent_var,collapse="")) if (formula_elem[1] != "1") formula_withoutlatent_var[[i]] = paste0(formula_withoutlatent_var[[i]], " - 1") formula_withoutlatent_var[[i]] = stats::as.formula(formula_withoutlatent_var[[i]]) formula_elem_withlatent_var = formula_elem[index_with_latent_var[[i]]] # Remove G elements from formula because we want (b1 + b2*E + ...)*G rather than b1*G + b2*E*G + ... 
formula_elem_withlatent_var = gsub(paste0(" ",names(latent_var)[i]," "),"1",formula_elem_withlatent_var, fixed=TRUE) formula_elem_withlatent_var = gsub(paste0(" ",names(latent_var)[i],":"),"",formula_elem_withlatent_var, fixed=TRUE) formula_elem_withlatent_var = gsub(paste0(":",names(latent_var)[i],":"),":",formula_elem_withlatent_var, fixed=TRUE) formula_elem_withlatent_var = gsub(paste0(":",names(latent_var)[i]," "),"",formula_elem_withlatent_var, fixed=TRUE) formula_elem_withlatent_var[-length(formula_elem_withlatent_var)] = paste0(formula_elem_withlatent_var[-length(formula_elem_withlatent_var)], " + ") formula_withlatent_var[[i]] = paste0(formula_outcome, " ~ ", paste0(formula_elem_withlatent_var,collapse="")) if (sum(grepl("1",formula_elem_withlatent_var, fixed=TRUE))==0) formula_withlatent_var[[i]] = paste0(formula_withlatent_var[[i]], " - 1") formula_withlatent_var[[i]] = stats::as.formula(formula_withlatent_var[[i]]) # Making formula for step i latent_var_names = colnames(latent_var[[i]]) latent_var_names[-length(latent_var[[i]])] = paste0(colnames(latent_var[[i]])[-length(latent_var[[i]])], " + ") formula_step[[i]] = paste0(formula_outcome, " ~ ", paste0(latent_var_names,collapse="")) formula_step[[i]] = paste0(formula_step[[i]], " offset(R0_",i,") - 1") formula_step[[i]] = stats::as.formula(formula_step[[i]]) } for (j in 1:maxiter){ ## Step a : fit main model fit_a = stats::glm(formula, data=data, family=family, y=FALSE, model=FALSE) conv_latent_var = TRUE for (i in 1:k){ if (NCOL(latent_var[[i]])>1){ # Reparametrizing variables for step i (estimating i-th latent_var) data_expanded_withoutlatent_var = stats::model.matrix(formula_withoutlatent_var[[i]], data=data) data[,paste0("R0_",i)] = data_expanded_withoutlatent_var%*%stats::coef(fit_a)[!index_with_latent_var[[i]]] data_expanded_withlatent_var = stats::model.matrix(formula_withlatent_var[[i]], data=data) R1 = data_expanded_withlatent_var%*%stats::coef(fit_a)[index_with_latent_var[[i]]] R1_latent_var = 
latent_var[[i]]*as.vector(R1) data[,colnames(latent_var[[i]])]=R1_latent_var ## Step i-th : fit model for i-th latent_var fit_[[i]] = stats::glm(formula_step[[i]], data=data, family=family, y=FALSE, model=FALSE) weights_latent_var_ = stats::coef(fit_[[i]]) # Updating latent_var estimates and checking convergence weights_latent_var_old[[i]] = weights_latent_var[[i]] weights_latent_var[[i]] = weights_latent_var_/sum(abs(weights_latent_var_)) data[,names(latent_var)[i]] = latent_var[[i]]%*%weights_latent_var[[i]] if(sqrt(sum((weights_latent_var_old[[i]]-weights_latent_var[[i]])^2)) < eps) conv_latent_var = conv_latent_var & TRUE else conv_latent_var = FALSE } else conv_latent_var = conv_latent_var & TRUE } if (conv_latent_var) break } # Rerunning last time and scaling to return as results fit_a = stats::glm(formula, data=data, family=family, y=FALSE, model=FALSE) warn = FALSE total_rank = 0 for (i in 1:k){ # Reparametrizing variables for step i (estimating i-th latent_var) data_expanded_withoutlatent_var = stats::model.matrix(formula_withoutlatent_var[[i]], data=data) data[,paste0("R0_",i)] = data_expanded_withoutlatent_var%*%stats::coef(fit_a)[!index_with_latent_var[[i]]] data_expanded_withlatent_var = stats::model.matrix(formula_withlatent_var[[i]], data=data) R1 = data_expanded_withlatent_var%*%stats::coef(fit_a)[index_with_latent_var[[i]]] R1_latent_var = latent_var[[i]]*as.vector(R1) data[,colnames(latent_var[[i]])]=R1_latent_var fit_[[i]] = stats::glm(formula_step[[i]], data=data, family=family, y=FALSE, model=FALSE) # Changing data temporarily to get model to make sure the model parameters sum to 1 temp = data[,colnames(latent_var[[i]])] data[,colnames(latent_var[[i]])] = data[,colnames(latent_var[[i]])]*sum(abs(stats::coef(fit_[[i]]))) fit_[[i]] = stats::glm(formula_step[[i]], data=data, family=family, y=FALSE, model=FALSE) data[,colnames(latent_var[[i]])] = temp data[,names(latent_var)[i]] = latent_var[[i]] %*% stats::coef(fit_[[i]]) if 
(abs(((fit_a$deviance-fit_[[i]]$deviance)/fit_a$deviance))>=.01 && !warn){
warning("Deviance differs by more than 1% between model parts. Make sure that everything was set up properly and try increasing the number of iterations (maxiter).")
warn = TRUE
}
total_rank = total_rank + fit_[[i]]$rank - 1 } # Make sure data make sense for user (and for plot function) fit_a$data = data
for (i in 1:k){
fit_[[i]]$data = data } #Change some arguments so that we get the right AIC, BIC and dispersion for the model true_rank = fit_a$rank + total_rank
# true_rank might be different from the df of logLik; this is because glm() assumes that variance components should be counted
# I don't agree, but we need to follow the same guideline to make sure it matches with glm()
logLik_df =  attr(logLik(fit_a), "df") + total_rank

if (is.na(logLik(fit_a))){
# Quasi-GLM stuff
get_dispersion <- function(object) {
with(object,sum((weights * residuals^2)[weights > 0])/df.residual)
}
get_loglik <- function(object) {
-object$deviance / 2 } log_lik = get_loglik(fit_a) c_hat = get_dispersion(fit_a) } else{ log_lik = logLik(fit_a)[1] c_hat = 1 } true_aic = 2*logLik_df - (2*log_lik/c_hat) true_aicc = true_aic + ((2*logLik_df*(logLik_df+1))/(stats::nobs(fit_a)-logLik_df-1)) true_bic = log(stats::nobs(fit_a))*logLik_df - (2*log_lik/c_hat) true_df.residual = stats::nobs(fit_a) - true_rank true_null.deviance = fit_a$null.deviance

# Print convergence information
if (conv_latent_var){
if (print) cat(paste0("Converged in ",j, " iterations\n"))
}
else{
warning(paste0("Did not reach convergence in maxiter iterations. Try increasing maxiter or make eps smaller."))
}
if (is.null(ylim)) result = list(fit_main = fit_a, fit_latent_var = fit_, true_model_parameters=list(AIC = true_aic, AICc = true_aicc, BIC = true_bic, rank = true_rank, df.residual = true_df.residual, null.deviance=true_null.deviance))
else result = list(fit_main = fit_a, fit_latent_var = fit_, true_model_parameters=list(AIC = true_aic, AICc = true_aicc, BIC = true_bic, rank = true_rank, df.residual = true_df.residual, null.deviance=true_null.deviance), ylim=ylim)
class(result) <- "IMLEGIT"
return(result)
}

#' @title LEGIT to IMLEGIT
#' @description Transforms a LEGIT model into a IMLEGIT model (Useful if you want to do plot() or GxE_interaction_test() with a model resulting from a variable selection method which gave a IMLEGIT model)
#' @param fit LEGIT model
#' @param data data.frame of the dataset to be used.
#' @param genes data.frame of the variables inside the genetic score \emph{G} (can be any sort of variable, doesn't even have to be genetic).
#' @param env data.frame of the variables inside the environmental score \emph{E} (can be any sort of variable, doesn't even have to be environmental).
#' @param formula Model formula. Use \emph{E} for the environmental score and \emph{G} for the genetic score. Do not manually code interactions, write them in the formula instead (ex: G*E*z or G:E:z).
#' @param eps Threshold for convergence (.01 for quick batch simulations, .0001 for accurate results).
#' @param maxiter Maximum number of iterations.
#' @param family Outcome distribution and link function (Default = gaussian).
#' @param ylim Optional vector containing the known min and max of the outcome variable. Even if your outcome is known to be in [a,b], if you assume a Gaussian distribution, predict() could return values outside this range. This parameter ensures that this never happens. This is not necessary with a distribution that already assumes the proper range (ex: [0,1] with binomial distribution).
#' @param print If FALSE, nothing except warnings will be printed (Default = TRUE).
#' @return Returns an object of the class "IMLEGIT" which is list containing, in the following order: a glm fit of the main model, a list of the glm fits of the latent variables and a list of the true model parameters (AIC, BIC, rank, df.residual, null.deviance) for which the individual model parts (main, genetic, environmental) don't estimate properly.
#' @examples
#'	train = example_2way(500, 1, seed=777)
#'	fit = LEGIT(train$data, train$G, train$E, y ~ G*E, train$coef_G, train$coef_E)
#'	fit_IMLEGIT = LEGIT_to_IMLEGIT(fit,train$data, train$G, train$E, y ~ G*E)
#'	fit_LEGIT = IMLEGIT_to_LEGIT(fit_IMLEGIT,train$data, train$G, train$E, y ~ G*E)
#' @import formula.tools stats
#' @references Alexia Jolicoeur-Martineau, Ashley Wazana, Eszter Szekely, Meir Steiner, Alison S. Fleming, James L. Kennedy, Michael J. Meaney, Celia M.T. Greenwood and the MAVAN team. \emph{Alternating optimization for GxE modelling with weighted genetic and environmental scores: examples from the MAVAN study} (2017). arXiv:1703.08111.
#' @export

LEGIT_to_IMLEGIT = function(fit, data, genes, env, formula, eps=.001, maxiter=100, family=gaussian, ylim=NULL, print=TRUE){
	# Refit the model as IMLEGIT: keep only the variables present in the LEGIT
	# fit and start from the LEGIT coefficient estimates (fast convergence since
	# we start at the solution).
	fit = IMLEGIT(data = data, formula = formula,
		latent_var = list(G = genes[,names(coef(fit$fit_genes)),drop=FALSE],
			E = env[,names(coef(fit$fit_env)),drop=FALSE]),
		start_latent_var = list(coef(fit$fit_genes),coef(fit$fit_env)),
		eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=print)
	return(fit)
}

#' @title IMLEGIT to LEGIT
#' @description Transforms a IMLEGIT model into a LEGIT model
#' @param fit IMLEGIT model
#' @param data data.frame of the dataset to be used.
#' @param genes data.frame of the variables inside the genetic score \emph{G} (can be any sort of variable, doesn't even have to be genetic).
#' @param env data.frame of the variables inside the environmental score \emph{E} (can be any sort of variable, doesn't even have to be environmental).
#' @param formula Model formula. Use \emph{E} for the environmental score and \emph{G} for the genetic score. Do not manually code interactions, write them in the formula instead (ex: G*E*z or G:E:z).
#' @param eps Threshold for convergence (.01 for quick batch simulations, .0001 for accurate results).
#' @param maxiter Maximum number of iterations.
#' @param family Outcome distribution and link function (Default = gaussian).
#' @param ylim Optional vector containing the known min and max of the outcome variable. Even if your outcome is known to be in [a,b], if you assume a Gaussian distribution, predict() could return values outside this range. This parameter ensures that this never happens. This is not necessary with a distribution that already assumes the proper range (ex: [0,1] with binomial distribution).
#' @param print If FALSE, nothing except warnings will be printed (Default = TRUE).
#' @return Returns an object of the class "LEGIT" which is list containing, in the following order: a glm fit of the main model, a glm fit of the genetic score, a glm fit of the environmental score, a list of the true model parameters (AIC, BIC, rank, df.residual, null.deviance) for which the individual model parts (main, genetic, environmental) don't estimate properly and the formula.
#' @examples
#'	train = example_2way(500, 1, seed=777)
#'	fit = LEGIT(train$data, train$G, train$E, y ~ G*E, train$coef_G, train$coef_E)
#'	fit_IMLEGIT = LEGIT_to_IMLEGIT(fit,train$data, train$G, train$E, y ~ G*E)
#'	fit_LEGIT = IMLEGIT_to_LEGIT(fit_IMLEGIT,train$data, train$G, train$E, y ~ G*E)
#' @import formula.tools stats
#' @references Alexia Jolicoeur-Martineau, Ashley Wazana, Eszter Szekely, Meir Steiner, Alison S. Fleming, James L. Kennedy, Michael J. Meaney, Celia M.T. Greenwood and the MAVAN team. \emph{Alternating optimization for GxE modelling with weighted genetic and environmental scores: examples from the MAVAN study} (2017). arXiv:1703.08111.
#' @export

IMLEGIT_to_LEGIT = function(fit, data, genes, env, formula, eps=.001, maxiter=100, family=gaussian, ylim=NULL, print=TRUE){
	# Pull the estimated weights of the genetic and environmental scores out of
	# the IMLEGIT fit; they determine both which variables to keep and the
	# starting points for the LEGIT refit.
	coef_G = coef(fit$fit_latent_var$G)
	coef_E = coef(fit$fit_latent_var$E)
	refit = LEGIT(data = data, formula = formula,
		genes = genes[,names(coef_G),drop=FALSE],
		env = env[,names(coef_E),drop=FALSE],
		start_genes = coef_G, start_env = coef_E,
		eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=print)
	return(refit)
}

#' @title Elastic net for variable selection in IMLEGIT model
#' @description [Fast and accurate, highly recommended] Apply Elastic Net (from the glmnet package) with IMLEGIT to obtain the order of variable removal that makes the most sense. The output shows the information criterion at every step, so you can decide which variable to retain. It is significantly faster (seconds/minutes instead of hours) than all other variable selection approaches (except for stepwise) and it is very accurate. Note that, as opposed to LEGIT/IMLEGIT, the parameters of variables inside the latent variables are not L1-normalized; instead, its the main model parameters which are L1-normalized. This is needed to make elastic net works. It doesn't matter in the end, because we only care about which variables were removed and we only output the IMLEGIT models without elastic net penalization.
#' @param data data.frame of the dataset to be used.
#' @param latent_var list of data.frame. The elements of the list are the datasets used to construct each latent variable. For interpretability and proper convergence, not using the same variable in more than one latent variable is highly recommended. It is recommended to set names to the list elements to prevent confusion because otherwise, the latent variables will be named L1, L2, ... (See examples below for more details)
#' @param formula Model formula. The names of \code{latent_var} can be used in the formula to represent the latent variables. If names(\code{latent_var}) is NULL, then L1, L2, ... can be used in the formula to represent the latent variables. Do not manually code interactions, write them in the formula instead (ex: G*E1*E2 or G:E1:E2).
#' @param latent_var_searched Optional If not null, you must specify a vector containing all indexes of the latent variables you want to use elastic net on. Ex: If latent_var=list(G=genes, E=env), specifying latent_var_search=c(1,2) will use both, latent_var_search=1 will only do it for G, and latent_var_search=2 will only do it for E.
#' @param cross_validation (Optional) If TRUE, will return cross-validation criterion (slower, but very good criterion).
#' @param alpha The elasticnet mixing parameter (between 0 and 1). 1 leads to lasso, 0 leads to ridge. See glmnet package manual for more information. We recommend somewhere betwen .50 and 1.
#' @param standardize If TRUE, standardize all variables inside every latent_var component. Note that if FALSE, glmnet will still standardize and unstandardize, but it will do so for each model (i.e., when at the step of estimating the parameters of latent variable G it standardize them, apply glmnet, then unstandarize them). This means that fixed parameters in the alternating steps are not standardized when standardize=FALSE. In practice, we found that standardize=FALSE leads to weird paths that do not always make sense. In the end, we only care about the order of the variable removal from the glmnet. We highly recommend standardize=TRUE for best results.
#' @param lambda_path Optional vector of all lambda (penalty term for elastic net, see glmnet package manual). By default, we automatically determine it.
#' @param lambda_mult scalar which multiplies the maximum lambda (penalty term for elastic net, see glmnet package manual) from the lambda path determined automatically. Sometimes, the maximum lambda found automatically is too big or too small and you may not want to spend the effort to manually set your own lambda path. This is where this comes in, you can simply scale lambda max up or down. (Default = 1)
#' @param lambda_min minimum lambda (penalty term for elastic net, see glmnet package manual) from the lambda path. (Default = .0001)
#' @param n_lambda Number of lambda (penalty term for elastic net, see glmnet package manual) in lambda path. Make lower for faster training, or higher for more precision. If you have many variables, make it bigger than 100 (Default = 100).
#' @param start_latent_var Optional list of starting points for each latent variable (The list must have the same length as the number of latent variables and each element of the list must have the same length as the number of variables of the corresponding latent variable).
#' @param eps Threshold for convergence (.01 for quick batch simulations, .0001 for accurate results).
#' @param maxiter Maximum number of iterations.
#' @param family Outcome distribution and link function (Default = gaussian).
#' @param ylim Optional vector containing the known min and max of the outcome variable. Even if your outcome is known to be in [a,b], if you assume a Gaussian distribution, predict() could return values outside this range. This parameter ensures that this never happens. This is not necessary with a distribution that already assumes the proper range (ex: [0,1] with binomial distribution).
#' @param cv_iter Number of cross-validation iterations (Default = 5).
#' @param cv_folds Number of cross-validation folds (Default = 10). Using \code{cv_folds=NROW(data)} will lead to leave-one-out cross-validation.
#' @param folds Optional list of vectors containing the fold number for each observation. Bypass cv_iter and cv_folds. Setting your own folds could be important for certain data types like time series or longitudinal data.
#' @param classification Set to TRUE if you are doing classification (binary outcome).
#' @param Huber_p Parameter controlling the Huber cross-validation error (Default = 1.345).
#' @param print If FALSE, nothing except warnings will be printed. (Default = TRUE).
#' @param test_only If TRUE, only uses the first fold for training and predict the others folds; do not train on the other folds. So instead of cross-validation, this gives you train/test and you get the test R-squared as output.
#' @return Returns an object of the class "elastic_net_var_select" which is list containing, in the following order: the criterion at each lambda, the coefficients of the latent variables at each lambda, the fits of each IMLEGIT models for each variable retained at each lambda, and the vector of lambda used.
#' @examples
#'	\dontrun{
#'	N = 1000
#'	train = example_3way(N, sigma=1, logit=FALSE, seed=7)
#'	train$G = cbind(train$G, g1_bad, g2_bad, g3_bad, g4_bad, g5_bad)
#'	lv = list(G=train$G, E=train$E)
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E)
#'	summary(fit)
#'	best_model(fit, criterion="BIC")
#'	# Instead of taking the best, if you want the model with "Model index"=17 from summary, do
#'	# fit_mychoice = fit$fit[[17]]
#'	plot(fit)
#'	# With Cross-validation
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, cross_validation=TRUE, cv_iter=1, cv_folds=5)
#'	best_model(fit, criterion="cv_R2")
#'	# Elastic net only applied on G
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, c(1))
#'	# Elastic net only applied on E
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, c(2))
#'	# Most E variables not removed, use lambda_mult > 1 to remove more
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, c(2), lambda_mult=5)
#'	# Lasso (only L1 regularization)
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, alpha=1)
#'	# Want more lambdas (useful if # of variables is large)
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, n_lambda = 200)
#'	}
#' @import formula.tools stats
#' @references Alexia Jolicoeur-Martineau, Ashley Wazana, Eszter Szekely, Meir Steiner, Alison S. Fleming, James L. Kennedy, Michael J. Meaney, Celia M.T. Greenwood and the MAVAN team. \emph{Alternating optimization for GxE modelling with weighted genetic and environmental scores: examples from the MAVAN study} (2017). arXiv:1703.08111.
#' @export

elastic_net_var_select = function(data, latent_var, formula, latent_var_searched=NULL, cross_validation=FALSE, alpha=.75, standardize=TRUE, lambda_path=NULL, lambda_mult=1, lambda_min = .0001, n_lambda = 100, start_latent_var=NULL, eps=.001, maxiter=100, family=gaussian, ylim=NULL, cv_iter=5, cv_folds=10, folds=NULL, Huber_p=1.345, classification=FALSE, print=TRUE, test_only=FALSE)
{
	if (!is.null(ylim)){
		if (!is.numeric(ylim) || length(ylim) != 2) stop("ylim must either be NULL or a numeric vector of size two")
	}
	# Setting up latent_var and checks
	# Use inherits() instead of class(x) == "...": class() can return a vector
	# of length > 1 (e.g. c("matrix","array") in R >= 4.0), which breaks && comparisons.
	if (!inherits(latent_var, "list")) stop("latent_var must be a list of datasets")
	k = length(latent_var)
	if (k == 0) stop("latent_var cannot be an empty list")
	if (is.null(names(latent_var))){
		if (print) cat("You have not specified names for the latent variables, assigning names to latent_var is highly recommended to prevent confusion. For now, they will be named L1, L2, ...\n")
		names(latent_var) = paste0("L",1:k)
	}
	for (i in 1:k){
		latent_var[[i]] = as.matrix(data.frame(latent_var[[i]],fix.empty.names=FALSE))
		if (sum(colnames(latent_var[[i]])=="") > 0){
			if (print) cat(paste0("You have not specified column names for certain elements in ",names(latent_var)[i], ", elements of this latent variable will be named ",names(latent_var)[i],1,", ",names(latent_var)[i],2," ...\n"))
			colnames(latent_var[[i]]) = paste0(names(latent_var)[i],1:NCOL(latent_var[[i]]))
		}
	}
	# More checks
	if (maxiter <= 0) warning("maxiter must be > 0")
	if (k > 1) for (i in 1:(k-1)) if (NROW(latent_var[[i]]) != NROW(latent_var[[i+1]])) stop("Some datasets in latent_var don't have the same number of observations")
	if (!is.null(start_latent_var)){
		if (!inherits(start_latent_var, "list")) stop("start_latent_var must be a list of vectors (or NULL)")
		if (k != length(start_latent_var)) stop("start_latent_var must have the same size as latent_var")
		for (i in 1:k){
			if (!is.null(latent_var[[i]])){
				if (NCOL(latent_var[[i]]) != length(start_latent_var[[i]])) stop("All elements of start_latent_var must either be NULL or have the same length as the number of the elements in its associated latent variable")
			}
		}
	}
	if (!inherits(data, "data.frame") && !inherits(data, "matrix")) stop("data must be a data.frame")

	# getting right formats
	# Retaining only the needed variables from the dataset (need to set elements in latent_var for this to work, they will be replaced with their proper values later)
	data = data.frame(data)
	for (i in 1:k) data[,names(latent_var)[i]] = 0
	data = stats::model.frame(formula, data=data, na.action=na.pass)
	formula = stats::as.formula(formula)

	if (sum(apply(data,2,is.numeric)) != NCOL(data)) stop("All variables used must be numeric, factors are not allowed. Please dummy code all categorical variables inside your datasets (data, latent_var[[1]], latent_var[[2]], ...)")
	for (i in 1:k) if (sum(apply(latent_var[[i]],2,is.numeric)) != NCOL(latent_var[[i]])) stop("All variables used must be numeric, factors are not allowed. Please dummy code all categorical variables inside your datasets (data, latent_var[[1]], latent_var[[2]], ...)")

	# remove missing data
	comp = stats::complete.cases(data, latent_var[[1]])
	if (k > 1) for (i in 2:k) comp = comp & stats::complete.cases(latent_var[[i]])
	data = data[comp,, drop=FALSE]
	for (i in 1:k) latent_var[[i]] = latent_var[[i]][comp,, drop=FALSE]
	if (dim(data)[1] <= 0) stop("no valid observation without missing values")

	# Adding empty variables in main dataset for latent_var
	for (i in 1:k){
		data[,colnames(latent_var[[i]])] = 0
		data[,paste0("R0_",i)] = 0
	}

	formula_outcome = get.vars(formula)[1]
	if (is.null(latent_var_searched)) latent_var_searched = c(1:k)

	## Standardize variables: (need to use n instead of (n-1) as denominator)
	mysd <- function(y) sqrt(sum((y-mean(y))^2)/length(y))

	#### Lambda path (taken from https://stackoverflow.com/questions/23686067/default-lambda-sequence-in-glmnet-for-cross-validation)
	if (is.null(lambda_path)){
		# We find lambda max for all latent_var and use the maximum
		n = dim(data)[1]
		lambda_max = -Inf
		for (i in latent_var_searched){
			sx = scale(latent_var[[i]], scale = apply(latent_var[[i]], 2, mysd))
			sx = as.matrix(sx, ncol = NCOL(latent_var[[i]]), nrow = n)
			sy = as.vector(scale(data[,formula_outcome], scale = mysd(data[,formula_outcome])))
			# lambda_mult lets the user scale the automatically-found lambda max up or down
			lambda_max_current = max(lambda_mult*abs(colSums(sx*sy)))/n
			if (lambda_max < lambda_max_current) lambda_max = lambda_max_current
		}
		## Calculate lambda path (decreasing geometrically from lambda_max to lambda_max*lambda_min)
		lambda_path = round(exp(seq(log(lambda_max), log(lambda_max*lambda_min), length.out = n_lambda)), digits = 10)
	}

	if (standardize){
		for (i in 1:k){
			sx = scale(latent_var[[i]], scale = apply(latent_var[[i]], 2, mysd))
			latent_var[[i]] = as.matrix(sx, ncol = NCOL(latent_var[[i]]), nrow = dim(data)[1])
		}
	}

	# Results matrix: information criteria, plus CV criteria when requested
	if (cross_validation){
		if (classification) results = matrix(NA, length(lambda_path), 7)
		else results = matrix(NA, length(lambda_path), 6)
	}
	else results = matrix(NA, length(lambda_path), 3)
	if (cross_validation && classification) colnames(results) = c("AIC","AICc","BIC","cv_R2","cv_Huber","cv_L1","cv_AUC")
	else if (cross_validation && !classification) colnames(results) = c("AIC","AICc","BIC","cv_R2","cv_Huber","cv_L1")
	else colnames(results) = c("AIC","AICc","BIC")
	fit = vector("list", length(lambda_path))
	glmnet_coef = vector("list", length(lambda_path))
	n_var_zero_prev = -Inf
	for (i in seq_along(lambda_path)){
		result = IMLEGIT_net(data=data, latent_var=latent_var, formula=formula, latent_var_searched=latent_var_searched, cross_validation = FALSE, alpha=alpha, lambda=lambda_path[i], start_latent_var=start_latent_var, eps=eps, maxiter=maxiter, family=family, family_string=as.character(substitute(family)), ylim=ylim, print=FALSE, warn=FALSE)

		# Only do cross-validation if worth it (i.e., if a variable has now been added to the list of the ones used). This reduces computation.
		n_var_zero = sum(ceiling(abs(result$glmnet_coef)) == 0)
		if (cross_validation && n_var_zero_prev != n_var_zero && !is.null(result$fit)) result = IMLEGIT_net(data=data, latent_var=latent_var, formula=formula, latent_var_searched=latent_var_searched, classification = classification, cross_validation = TRUE, alpha=alpha, lambda=lambda_path[i], start_latent_var=start_latent_var, eps=eps, maxiter=maxiter, family=family, family_string=as.character(substitute(family)), ylim=ylim, print=FALSE, warn=FALSE, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, test_only = test_only)
		n_var_zero_prev = n_var_zero

		if (!is.null(result$fit)) fit[[i]] = result$fit
		glmnet_coef[[i]] = result$glmnet_coef
		if (is.null(fit[[i]])){
			if (cross_validation && classification) results[i,] = rep(NA, 7)
			else if (cross_validation && !classification) results[i,] = rep(NA, 6)
			else results[i,] = rep(NA, 3)
		}
		else{
			if (cross_validation){
				# When CV was skipped this round (no change in the variable set),
				# carry the previous row's CV criteria forward
				if (is.null(result$fit_cv)) R2_cv = results[max(i-1,1),"cv_R2"]
				else R2_cv = mean(result$fit_cv$R2_cv)
				if (is.null(result$fit_cv)) R2_h = results[max(i-1,1),"cv_Huber"]
				else R2_h = mean(result$fit_cv$Huber_cv)
				if (is.null(result$fit_cv)) R2_l1 = results[max(i-1,1),"cv_L1"]
				else R2_l1 = mean(result$fit_cv$L1_cv)
				if (classification){
					if (is.null(result$fit_cv)) AUC = results[max(i-1,1),"cv_AUC"]
					else AUC = mean(result$fit_cv$AUC)
				}
			}
			if (cross_validation && classification) results[i,] = c(fit[[i]]$true_model_parameters$AIC, fit[[i]]$true_model_parameters$AICc, fit[[i]]$true_model_parameters$BIC, R2_cv, R2_h, R2_l1, AUC)
			else if (cross_validation && !classification) results[i,] = c(fit[[i]]$true_model_parameters$AIC, fit[[i]]$true_model_parameters$AICc, fit[[i]]$true_model_parameters$BIC, R2_cv, R2_h, R2_l1)
			else results[i,] = c(fit[[i]]$true_model_parameters$AIC, fit[[i]]$true_model_parameters$AICc, fit[[i]]$true_model_parameters$BIC)
		}
	}
	out = list(results=results, fit=fit, glmnet_coef=glmnet_coef, lambda_path=lambda_path)
	class(out) = "elastic_net_var_select"
	return(out)
}

#' @title Independent Multiple Latent Environmental & Genetic InTeraction (IMLEGIT) model with Elastic Net on the latent variables. Do not use on it's own, use elastic_net_var_select instead.
#' @description Constructs a generalized linear model (glm) with latent variables using alternating optimization. This is an extension of the LEGIT model to accommodate more than 2 latent variables. Note that, as opposed to LEGIT/IMLEGIT, the parameters of variables inside the latent variables are not L1-normalized; instead, its the main model parameters which are L1-normalized. This is needed to make elastic net works. It doesn't matter in the end, because we only care about which variables were removed and we only give the IMLEGIT models without elastic net penalization.
#' @param data data.frame of the dataset to be used.
#' @param latent_var list of data.frame. The elements of the list are the datasets used to construct each latent variable. For interpretability and proper convergence, not using the same variable in more than one latent variable is highly recommended. It is recommended to set names to the list elements to prevent confusion because otherwise, the latent variables will be named L1, L2, ... (See examples below for more details)
#' @param formula Model formula. The names of \code{latent_var} can be used in the formula to represent the latent variables. If names(\code{latent_var}) is NULL, then L1, L2, ... can be used in the formula to represent the latent variables. Do not manually code interactions, write them in the formula instead (ex: G*E1*E2 or G:E1:E2).
#' @param latent_var_searched Optional If not null, you must specify a vector containing all indexes of the latent variables you want to use elastic net on. Ex: If latent_var=list(G=genes, E=env), specifying latent_var_search=c(1,2) will use both, latent_var_search=1 will only do it for G, and latent_var_search=2 will only do it for E.
#' @param cross_validation If TRUE, will return cross-validation criterion (slower)
#' @param alpha The elasticnet mixing parameter (between 0 and 1). 1 leads to lasso, 0 leads to ridge. See glmnet package manual for more information. We recommend somewhere betwen .50 and 1.
#' @param lambda Lambda (penalty term for elastic net, see glmnet package manual) (Default = .0001)
#' @param start_latent_var Optional list of starting points for each latent variable (The list must have the same length as the number of latent variables and each element of the list must have the same length as the number of variables of the corresponding latent variable).
#' @param eps Threshold for convergence (.01 for quick batch simulations, .0001 for accurate results).
#' @param maxiter Maximum number of iterations.
#' @param family Outcome distribution and link function (Default = gaussian).
#' @param ylim Optional vector containing the known min and max of the outcome variable. Even if your outcome is known to be in [a,b], if you assume a Gaussian distribution, predict() could return values outside this range. This parameter ensures that this never happens. This is not necessary with a distribution that already assumes the proper range (ex: [0,1] with binomial distribution).
#' @param cv_iter Number of cross-validation iterations (Default = 5).
#' @param cv_folds Number of cross-validation folds (Default = 10). Using \code{cv_folds=NROW(data)} will lead to leave-one-out cross-validation.
#' @param folds Optional list of vectors containing the fold number for each observation. Bypass cv_iter and cv_folds. Setting your own folds could be important for certain data types like time series or longitudinal data.
#' @param classification Set to TRUE if you are doing classification (binary outcome).
#' @param Huber_p Parameter controlling the Huber cross-validation error (Default = 1.345).
#' @param print If FALSE, nothing except warnings will be printed. (Default = TRUE).
#' @param warn If FALSE, it will not show warnings when all variables inside a latent variable are removed. This serves to prevent lots of warning when running elastic_net_var_select (Default = TRUE).
#' @param family_string Optional String version of the family (gaussian leads to "gaussian"). This is only needed when using elastic_net_var_select. Please ignore this.
#' @param test_only If TRUE, only uses the first fold for training and predict the others folds; do not train on the other folds. So instead of cross-validation, this gives you train/test and you get the test R-squared as output.
#' @return Returns a list containing, in the following order: a IMLEGIT model, the coefficients of the variables in the latent variables from glmnet models, and the cross-validation results (if asked).
#' @import formula.tools stats
#' @references Alexia Jolicoeur-Martineau, Ashley Wazana, Eszter Szekely, Meir Steiner, Alison S. Fleming, James L. Kennedy, Michael J. Meaney, Celia M.T. Greenwood and the MAVAN team. \emph{Alternating optimization for GxE modelling with weighted genetic and environmental scores: examples from the MAVAN study} (2017). arXiv:1703.08111.
#' @export
IMLEGIT_net = function(data, latent_var, formula, latent_var_searched=NULL, cross_validation=FALSE, alpha=1, lambda=.0001, start_latent_var=NULL, eps=.001, maxiter=100, family=gaussian, ylim=NULL, cv_iter=5, cv_folds=10, folds=NULL, Huber_p=1.345, classification=FALSE, print=TRUE, warn=TRUE, family_string=NULL, test_only=FALSE)
{
	if (!is.null(ylim)){
		if (!is.numeric(ylim) || length(ylim) != 2) stop("ylim must either be NULL or a numeric vector of size two")
	}
	# Setting up latent_var and checks
	# is.list() rather than class()=="list": robust to classed lists and to
	# multi-element class vectors (which would break "!=" in an if).
	if (!is.list(latent_var)) stop("latent_var must be a list of datasets")
	k = length(latent_var)
	if (k==0) stop("latent_var cannot be an empty list")
	if (is.null(names(latent_var))){
		if (print) cat("You have not specified names for the latent variables, assigning names to latent_var is highly recommended to prevent confusion. For now, they will be named L1, L2, ...\n")
		names(latent_var) = paste0("L",1:k)
	}
	for (i in 1:k){
		latent_var[[i]] = as.matrix(data.frame(latent_var[[i]],fix.empty.names=FALSE))
		if (sum(colnames(latent_var[[i]])=="") > 0){
			if (print) cat(paste0("You have not specified column names for certain elements in ",names(latent_var)[i], ", elements of this latent variable will be named ",names(latent_var)[i],1,", ",names(latent_var)[i],2," ...\n"))
			colnames(latent_var[[i]]) = paste0(names(latent_var)[i],1:NCOL(latent_var[[i]]))
		}
	}
	# More checks
	if (maxiter <= 0) warning("maxiter must be > 0")
	if (k > 1) for (i in 1:(k-1)) if (NROW(latent_var[[i]]) != NROW(latent_var[[i+1]])) stop("Some datasets in latent_var don't have the same number of observations")
	if(!is.null(start_latent_var)){
		if (!is.list(start_latent_var)) stop("start_latent_var must be a list of vectors (or NULL)")
		if (k!=length(start_latent_var)) stop("start_latent_var must have the same size as latent_var")
		for (i in 1:k){
			if (!is.null(latent_var[[i]])){
				if (NCOL(latent_var[[i]])!=length(start_latent_var[[i]])) stop("All elements of start_latent_var must either be NULL or have the same length as the number of the elements in its associated latent variable")
			}
		}
	}
	# is.data.frame/is.matrix instead of class() comparisons: class(matrix)
	# is c("matrix","array") in R >= 4.0, which made the old "&&" test error.
	if (!is.data.frame(data) && !is.matrix(data)) stop("data must be a data.frame")
	# getting right formats
	# Retaining only the needed variables from the dataset (need to set elements in latent_var for this to work, they will be replaced with their proper values later)
	data = data.frame(data)
	for (i in 1:k) data[,names(latent_var)[i]] = 0
	data = stats::model.frame(formula, data=data, na.action=na.pass)
	formula = stats::as.formula(formula)
	# Error message about factors
	if (sum(apply(data,2,is.numeric)) != NCOL(data)) stop("All variables used must be numeric, factors are not allowed. Please dummy code all categorical variables inside your datasets (data, latent_var[[1]], latent_var[[2]], ...)")
	for (i in 1:k) if (sum(apply(latent_var[[i]],2,is.numeric)) != NCOL(latent_var[[i]])) stop("All variables used must be numeric, factors are not allowed. Please dummy code all categorical variables inside your datasets (data, latent_var[[1]], latent_var[[2]], ...)")
	# remove missing data
	comp = stats::complete.cases(data,latent_var[[1]])
	if (k > 1) for (i in 2:k) comp = comp & stats::complete.cases(latent_var[[i]])
	data = data[comp,, drop=FALSE]
	for (i in 1:k) latent_var[[i]] = latent_var[[i]][comp,, drop=FALSE]
	if (dim(data)[1] <= 0) stop("no valid observation without missing values")
	# Adding empty variables in main dataset for latent_var
	for (i in 1:k){
		data[,colnames(latent_var[[i]])]=0
		data[,paste0("R0_",i)]=0
	}
	# Setting up initial weighted latent_var (equal weights unless starting points given;
	# starting points are L1-normalized)
	weights_latent_var_old = vector("list", k)
	weights_latent_var = vector("list", k)
	if (is.null(start_latent_var)){
		for (i in 1:k) weights_latent_var[[i]] = rep(1/dim(latent_var[[i]])[2],dim(latent_var[[i]])[2])
	}
	else{
		for (i in 1:k){
			if (sum(abs(start_latent_var[[i]]))==0) weights_latent_var[[i]] = rep(1/dim(latent_var[[i]])[2],dim(latent_var[[i]])[2])
			else weights_latent_var[[i]] = start_latent_var[[i]]/sum(abs(start_latent_var[[i]]))
		}
	}
	for (i in 1:k) data[,names(latent_var)[i]] = latent_var[[i]]%*%weights_latent_var[[i]]
	# Lists needed for later
	index_with_latent_var = vector("list", k)
	formula_withoutlatent_var = vector("list", k)
	formula_withlatent_var = vector("list", k)
	formula_step = vector("list", k)
	fit_ = vector("list", k)
	names(fit_) = names(latent_var)
	# Deconstructing formula into parts (With latent_var and without latent_var)
	formula_full = stats::terms(formula,simplify=TRUE)
	formula_outcome = get.vars(formula)[1]
	formula_elem_ = attributes(formula_full)$term.labels
	# Adding white spaces before and after to recognize a "E" as opposed to another string like "Elephant"
	formula_elem = paste("", formula_elem_,"")
	for (i in 1:k) index_with_latent_var[[i]] = grepl(paste0(" ",names(latent_var)[i]," "),formula_elem, fixed=TRUE) | grepl(paste0(" ",names(latent_var)[i],":"),formula_elem, fixed=TRUE) | grepl(paste0(":",names(latent_var)[i],":"),formula_elem, fixed=TRUE) | grepl(paste0(":",names(latent_var)[i]," "),formula_elem, fixed=TRUE)
	data_expanded = stats::model.matrix(formula, data=data)
	if (colnames(data_expanded)[1] == "(Intercept)"){
		intercept=TRUE
		formula_elem = c("1",formula_elem)
		for (i in 1:k) index_with_latent_var[[i]] = c(FALSE,index_with_latent_var[[i]])
	}
	else intercept=FALSE

	for (i in 1:k){
		## Formulas for reparametrizations in each steps
		formula_elem_withoutlatent_var = formula_elem[!index_with_latent_var[[i]]]
		formula_elem_withoutlatent_var[-length(formula_elem_withoutlatent_var)] = paste0(formula_elem_withoutlatent_var[-length(formula_elem_withoutlatent_var)], " + ")
		formula_withoutlatent_var[[i]] = paste0(formula_outcome, " ~ ", paste0(formula_elem_withoutlatent_var,collapse=""))
		if (formula_elem[1] != "1") formula_withoutlatent_var[[i]] = paste0(formula_withoutlatent_var[[i]], " - 1")
		formula_withoutlatent_var[[i]] = stats::as.formula(formula_withoutlatent_var[[i]])

		formula_elem_withlatent_var = formula_elem[index_with_latent_var[[i]]]
		# Remove G elements from formula because we want (b1 + b2*E + ...)*G rather than b1*G + b2*E*G + ...
		formula_elem_withlatent_var = gsub(paste0(" ",names(latent_var)[i]," "),"1",formula_elem_withlatent_var, fixed=TRUE)
		formula_elem_withlatent_var = gsub(paste0(" ",names(latent_var)[i],":"),"",formula_elem_withlatent_var, fixed=TRUE)
		formula_elem_withlatent_var = gsub(paste0(":",names(latent_var)[i],":"),":",formula_elem_withlatent_var, fixed=TRUE)
		formula_elem_withlatent_var = gsub(paste0(":",names(latent_var)[i]," "),"",formula_elem_withlatent_var, fixed=TRUE)
		formula_elem_withlatent_var[-length(formula_elem_withlatent_var)] = paste0(formula_elem_withlatent_var[-length(formula_elem_withlatent_var)], " + ")
		formula_withlatent_var[[i]] = paste0(formula_outcome, " ~ ", paste0(formula_elem_withlatent_var,collapse=""))
		# NOTE(review): grepl("1", ...) also matches names containing the character "1"
		# (e.g. "E1"); kept as-is to preserve behavior, but worth confirming upstream.
		if (sum(grepl("1",formula_elem_withlatent_var, fixed=TRUE))==0) formula_withlatent_var[[i]] = paste0(formula_withlatent_var[[i]], " - 1")
		formula_withlatent_var[[i]] = stats::as.formula(formula_withlatent_var[[i]])

		# Making formula for step i: "y ~ v1 + v2 + ... + offset(R0_i) - 1".
		# Every column name gets a trailing " + " because the offset term is
		# concatenated afterwards without a separator. (The old code indexed with
		# length(latent_var[[i]]) -- the matrix element count, not the column
		# count -- which only worked by accident and broke for single-row data.)
		latent_var_names = paste0(colnames(latent_var[[i]]), " + ")
		formula_step[[i]] = paste0(formula_outcome, " ~ ", paste0(latent_var_names,collapse=""))
		formula_step[[i]] = paste0(formula_step[[i]], " offset(R0_",i,") - 1")
		formula_step[[i]] = stats::as.formula(formula_step[[i]])
	}

	if (is.null(latent_var_searched)) latent_var_searched = c(1:k)

	# Alternating optimization: fit the main model, then refit each latent
	# variable's weights (elastic net for the searched ones, plain glm otherwise)
	# until the weights stop moving by more than eps.
	for (j in 1:maxiter){
		## Step a : fit main model
		fit_a = stats::glm(formula, data=data, family=family, y=FALSE, model=FALSE)
		# L1 standardize the parameters of the main model
		weights_a = stats::coef(fit_a)
		weights_a = weights_a/sum(abs(weights_a))

		conv_latent_var = TRUE
		for (i in 1:k){
			if (NCOL(latent_var[[i]])>1){
				# Reparametrizing variables for step i (estimating i-th latent_var)
				data_expanded_withoutlatent_var = stats::model.matrix(formula_withoutlatent_var[[i]], data=data)
				data[,paste0("R0_",i)] = data_expanded_withoutlatent_var%*%weights_a[!index_with_latent_var[[i]]]
				data_expanded_withlatent_var = stats::model.matrix(formula_withlatent_var[[i]], data=data)
				R1 = data_expanded_withlatent_var%*%weights_a[index_with_latent_var[[i]]]
				R1_latent_var = latent_var[[i]]*as.vector(R1)
				data[,colnames(latent_var[[i]])]=R1_latent_var

				## Step i-th : fit model for i-th latent_var
				if (any(is.na(R1))){
					if (warn) warning(paste0("Lambda is too big, one latent variable has weight 0 everywhere. Use a smaller lambda."))
					weights_latent_var[[i]] = rep(0, NCOL(latent_var[[i]]))
					return(list(fit=NULL, glmnet_coef=unlist(weights_latent_var)))
				}
				else{

					if (i %in% latent_var_searched){ #glmnet
						# Bug fix: the offset must be the one computed for the i-th latent
						# variable (R0_i, set just above), not always R0_1.
						# standardize=FALSE in both branches for consistency (the second
						# branch used to omit it).
						if (is.null(family_string)) fit_[[i]] = glmnet::glmnet(as.matrix(latent_var[[i]]), data[,formula_outcome], family=as.character(substitute(family)), alpha=alpha, lambda=lambda, offset=data[,paste0("R0_",i)], intercept=FALSE, standardize=FALSE)
						else fit_[[i]] = glmnet::glmnet(as.matrix(latent_var[[i]]), data[,formula_outcome], family=family_string, alpha=alpha, lambda=lambda, offset=data[,paste0("R0_",i)], intercept=FALSE, standardize=FALSE)
						weights_latent_var_ = as.numeric(stats::coef(fit_[[i]])[-1]) # removing the 0 intercept
					}
					else{ #glm
						fit_[[i]] = stats::glm(formula_step[[i]], data=data, family=family, y=FALSE, model=FALSE)
						weights_latent_var_ = stats::coef(fit_[[i]])
					}

					# Updating latent_var estimates and checking convergence.
					# glmnet weights are kept raw (the main-model weights carry the L1
					# normalization instead); glm weights are L1-normalized.
					weights_latent_var_old[[i]] = weights_latent_var[[i]]
					if (i %in% latent_var_searched) weights_latent_var[[i]] = weights_latent_var_
					else weights_latent_var[[i]] = weights_latent_var_/sum(abs(weights_latent_var_))
					data[,names(latent_var)[i]] = latent_var[[i]]%*%weights_latent_var[[i]]
					if(sqrt(sum((weights_latent_var_old[[i]]-weights_latent_var[[i]])^2)) >= eps) conv_latent_var = FALSE
				}
			}
		}
		if (conv_latent_var) break
	}

	# Rerunning last time and scaling to return as results
	# We simply fit a IMLEGIT with the same starting points (they are automatically standardized by IMLEGIT).
	weights_latent_var_backup = weights_latent_var
	for (i in 1:k){
		names(weights_latent_var_backup[[i]]) = colnames(latent_var[[i]]) # backup names
		latent_var[[i]] = latent_var[[i]][,colnames(latent_var[[i]])[weights_latent_var[[i]]!=0],drop=FALSE] # Returns only the elements of each latent var which has weight non-zero
		weights_latent_var[[i]] = weights_latent_var[[i]][weights_latent_var[[i]]!=0]
	}
	result = IMLEGIT(data=data, latent_var=latent_var, formula=formula, start_latent_var=weights_latent_var, eps=eps, maxiter=maxiter, family=family, ylim=ylim, print=print)
	if (cross_validation) result_cv = IMLEGIT_cv(data=data, latent_var=latent_var, formula=formula, start_latent_var=weights_latent_var, eps=eps, maxiter=maxiter, family=family, ylim=ylim, cv_iter=cv_iter, cv_folds=cv_folds, folds=folds, Huber_p=Huber_p, classification=classification, test_only = test_only)
	else result_cv = NULL
	# print convergences stuff;
	if (conv_latent_var){
		if (print) cat(paste0("Converged in ",j, " iterations\n"))
	}
	else{
		warning(paste0("Did not reach convergence in maxiter iterations. Try increasing maxiter or make eps smaller."))
	}
	return(list(fit=result, glmnet_coef=unlist(weights_latent_var_backup), fit_cv=result_cv))
}

#' @title Plot function for the output of elastic_net_var_select
#' @description Plot of the coefficients of variables inside the latent variables with respect to the log(lambda). This is your typical elastic-net plot.
#' @param x An object of class "elastic_net_var_select", usually, a result of a call to elastic_net_var_select.
#' @param lwd Thickness of the lines (Default = 2)
#' @param start At which lambda to start (from large lambda to small lambda). If start is not 1, we remove some of the largest lambdas; this can make the plot easier to visualize (Default = 1).
#' @param ... Further arguments passed to or from other methods.
#' @return Returns the plot of the coefficients of variables inside the latent variables with respect to the log(lambda).
#' @examples
#'	\dontrun{
#'	N = 1000
#'	train = example_3way(N, sigma=1, logit=FALSE, seed=7)
#'	train$G = cbind(train$G, g1_bad, g2_bad, g3_bad, g4_bad, g5_bad)
#'	lv = list(G=train$G, E=train$E)
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E)
#'	summary(fit)
#'	best_model(fit, criterion="BIC")
#'	# Instead of taking the best, if you want the model with "Model index"=17 from summary, do
#'	# fit_mychoice = fit$fit[[17]]
#'	plot(fit)
#'	# With Cross-validation
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, cross_validation=TRUE, cv_iter=1, cv_folds=5)
#'	best_model(fit, criterion="cv_R2")
#'	# Elastic net only applied on G
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, c(1))
#'	# Elastic net only applied on E
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, c(2))
#'	# Most E variables not removed, use lambda_mult > 1 to remove more
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, c(2), lambda_mult=5)
#'	# Lasso (only L1 regularization)
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, alpha=1)
#'	# Want more lambdas (useful if # of variables is large)
#'	fit = elastic_net_var_select(train$data, lv, y ~ G*E, n_lambda = 200)
#'	}
#' @import formula.tools stats
#' @references Alexia Jolicoeur-Martineau, Ashley Wazana, Eszter Szekely, Meir Steiner, Alison S. Fleming, James L. Kennedy, Michael J. Meaney, Celia M.T. Greenwood and the MAVAN team. \emph{Alternating optimization for GxE modelling with weighted genetic and environmental scores: examples from the MAVAN study} (2017). arXiv:1703.08111.
#' @exportS3Method plot elastic_net_var_select

plot.elastic_net_var_select = function(x, lwd=2, start=1, ...){
	object = x
	n = length(object$glmnet_coef)
	# The model refit at the last (smallest) lambda; its latent-variable fits
	# give the number of variables per latent variable for line types/colors.
	last_fit = object$fit[[n]]
	# Bug fix: this invariant check used to sit inside the loop below (and was
	# only reached through the accidental 1:0 iteration when k was 0).
	if (is.null(last_fit)) stop("No fit to plot. This must be because all your choices of lambda were extremely large.")
	k = length(last_fit$fit_latent_var)
	lty_ = c()
	n_var = vector("list", k)
	n_var_total = 0
	for (i in seq_len(k)){
		n_var[[i]] = length(coef(last_fit$fit_latent_var[[i]]))
		lty_ = c(lty_, rep(i,n_var[[i]]))
		n_var_total = n_var_total + n_var[[i]]
	}
	# One row per lambda, one column per variable inside the latent variables
	A = matrix(unlist(object$glmnet_coef), ncol = n_var_total, byrow = TRUE)
	# Optionally drop the first (largest) lambdas to make the plot readable
	path = object$lambda_path[start:n]
	A = A[start:n,]
	matplot(path,A,type="l", col=RColorBrewer::brewer.pal(n_var_total,"Paired"), xlab="log(lambda)", ylab="Coefficients", lty=lty_, lwd=lwd)
	abline(0,0, col="grey", lty=3)
	legend("topright",legend=names(object$glmnet_coef[[n]]), col=RColorBrewer::brewer.pal(n_var_total,"Paired"), lty=lty_, bg="white", lwd=lwd)
}

#' @title Summary function for the output of elastic_net_var_select
#' @description Summary function for the output of