## @knitr methods
# source("/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/ggmix/simulation/packages.R")
# source("/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/ggmix/simulation/functions.R")
source("/home/sahir/git_repositories/ggmix/simulation/packages.R")
source("/home/sahir/git_repositories/ggmix/simulation/functions.R")
# lasso -------------------------------------------------------------------
lasso <- new_method("lasso", "lasso",
method = function(model, draw) {
# browser()
fitglmnet <- glmnet::glmnet(x = draw[["xtrain_lasso"]],
y = draw[["ytrain"]],
alpha = 1,
nlambda = 50,
standardize = FALSE,
penalty.factor = c(rep(1, ncol(draw[["xtrain"]])), rep(0,10)))
# gicp <- -2*fitglmnet$nulldev*fitglmnet$dev.ratio + log(log(length(draw[["ytrain"]])))*log(ncol(draw[["xtrain"]])) * (fitglmnet$df)
# lambda_min_lasso <- fitglmnet$lambda[which.min(gicp)]
# plot(log(fitglmnet$lambda), gicp)
# fitglmnet$nulldev - deviance(fitglmnet) / this is the same as fitglmnet$nulldev*fitglmnet$dev.ratio
yhat_lasso <- predict(fitglmnet, newx = draw[["xtest_lasso"]])
cvmat <- apply((draw[["ytest"]] - yhat_lasso)^2, 2, mean)
lambda_min_lasso <- fitglmnet$lambda[which.min(cvmat)]
nz_names <- setdiff(rownames(coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop = F]),c("(Intercept)",paste0("PC",1:10)))
model_error <- l2norm(draw[["mu_train"]] -
draw[["xtrain"]] %*% coef(fitglmnet, s = lambda_min_lasso)[2:(ncol(draw[["xtrain"]]) + 1),,drop = F])
# defined in Bertsimas et al. 2016
# Best Subset Selection via a Modern Optimization Lens
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
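# i.e. ME = ||mu_train - X beta_hat||_2 with mu_train the true training mean,
# and relative prediction error PE = ME^2 / ||mu_train||_2^2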
# individual-level predictions
yhat <- predict(fitglmnet, newx = draw[["xvalidate_lasso"]], s = lambda_min_lasso)
# yhat <- cbind(1, draw[["xtest"]]) %*% coef(fitglmnet, s = "lambda.min")[c("(Intercept)",colnames(draw[["xtest"]])),,drop = F]
# yhat_train <- cbind(1, draw[["xtrain"]]) %*% coef(fitglmnet, s = "lambda.min")[c("(Intercept)",colnames(draw[["xtrain"]])),,drop = F]
yhat_train <- predict(fitglmnet, newx = draw[["xtrain_lasso"]], s = lambda_min_lasso)
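# residual variance estimate: ||y - yhat||^2 / (n - df), where df counts the
# selected predictors plus the intercept and the 10 unpenalized PCs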
error_var <- l2norm(yhat_train - draw[["ytrain"]])^2 / (length(draw[["ytrain"]]) - (length(nz_names)+1+10)) # add back intercept and PCs
list(beta = coef(fitglmnet, s = lambda_min_lasso)[-1,,drop = F],
model_error = model_error,
prediction_error = prediction_error,
eta = NA,
sigma2 = NA,
yhat = yhat, # on the validation set using principal components
nonzero = coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop = F],
nonzero_names = nz_names,
ytrain = draw[["ytrain"]],
ytest = draw[["ytest"]],
yvalidate = draw[["yvalidate"]],
error_variance = error_var,
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
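# lasso (and the methods below) are simulator::new_method objects. A minimal
# usage sketch, assuming packages.R loads the simulator package and that
# functions.R provides a model-generating function; make_model is illustrative:
if (FALSE) {
  library(simulator)
  sim <- new_simulation(name = "ggmix_sim", label = "ggmix simulation")
  sim <- generate_model(sim, make_model) # make_model assumed from functions.R
  sim <- simulate_from_model(sim, nsim = 2)
  sim <- run_method(sim, list(lasso))
}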
# lassoCV -----------------------------------------------------------------
# this uses train/validate split only
lassoCV <- new_method(name = "lassoCV", label = "Lasso with CV on training",
method = function(model, draw) {
fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain_lasso"]],
y = draw[["ytrain"]],
alpha = 1,
nlambda = 50,
standardize = FALSE,
penalty.factor = c(rep(1, ncol(draw[["xtrain"]])), rep(0,10)))
lambda_min_lasso <- fitglmnet$lambda.min
nz_names <- setdiff(rownames(coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop = F]),c("(Intercept)",paste0("PC",1:10)))
# PATCH December 4, 2019. this part is used to calculate TPR at a Fixed FPR of 5%
nonzero_names_alllambdas <- lapply(predict(fitglmnet, type = "nonzero", s = fitglmnet$lambda), function(i) {
# i = predict(fitglmnet, type = "nonzero", s = fitglmnet$lambda)$`50`
nz <- setdiff(colnames(draw[["xtrain_lasso"]][,i,drop = FALSE]), c("(Intercept)",paste0("PC",1:10)))
FP <- length(setdiff(nz, draw[["causal"]])) # false positives
TN <- length(draw[["not_causal"]]) # True negatives
FPR <- FP / (FP + TN)
FPR
})
FPRS <- do.call(cbind, nonzero_names_alllambdas)
index_of_FPR_at_5_percent <- which(abs(FPRS-0.05) == min(abs(FPRS-0.05)))[1] # take the first value in case there are multiple matches
actives <- setdiff(colnames(draw[["xtrain_lasso"]][,predict(fitglmnet, type = "nonzero", s = fitglmnet$lambda[index_of_FPR_at_5_percent])[,1],drop = FALSE]),
c("(Intercept)",paste0("PC",1:10)))
TPR_at_5_percent_FPR <- length(intersect(actives, draw[["causal"]]))/length(draw[["causal"]])
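# in words: compute the FPR of the selected (non-PC) variables at every lambda
# on the path, take the lambda whose FPR is closest to 5%, and report the TPR
# at that lambda; a standalone sketch follows this method definition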
### END of PATCH
# this is the design matrix of the selected variables plus a column of 1s for the intercept
x_nonzero_train <- cbind(1,draw[["xtrain"]][, nz_names, drop = FALSE])
x_nonzero_validate <- cbind(1,draw[["xvalidate"]][, nz_names, drop = FALSE])
xtx <- crossprod(x_nonzero_train)
# add small ridge in case solution has more than n nonzeros:
diag(xtx) <- diag(xtx) + ifelse(ncol(x_nonzero_train)>nrow(x_nonzero_train),1e-4,0)
bb <- solve(xtx, crossprod(x_nonzero_train, draw[["ytrain"]]))
# this is on the validation set
yhat <- x_nonzero_validate %*% bb
yhat_train <- predict(fitglmnet, newx = draw[["xtrain_lasso"]], s = lambda_min_lasso)
error_var <- l2norm(yhat_train - draw[["ytrain"]])^2 / (length(draw[["ytrain"]]) - (length(nz_names)+1+10)) # add back intercept and PCs
# these are the lasso-fitted betas
beta_orig <- beta_refit <- coef(fitglmnet, s = "lambda.min")[c(colnames(draw[["xtrain"]])),,drop = F]
# these are the refitted betas
beta_refit[match(rownames(bb)[-1], rownames(beta_orig)),,drop=FALSE] <- bb[-1]
# length(draw$beta)
# dim(beta_orig)
# l2norm(beta_orig-draw$beta)
# l2norm(beta_refit-draw$beta)
# defined in Bertsimas et al. 2016
# Best Subset Selection via a Modern Optimization Lens
model_error <- l2norm(draw[["mu_train"]] - x_nonzero_train %*% bb) # this uses the refitted betas
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
list(beta_refit = beta_refit,
beta_truth = draw[["beta"]],
model_error = model_error,
prediction_error = prediction_error,
eta = NA,
sigma2 = NA,
yhat = yhat, # on the validation set using the refitted OLS coefficients
nonzero = coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop = F],
nonzero_names = nz_names,
TPR_at_5_percent_FPR = TPR_at_5_percent_FPR,
ACTIVES_at_5_percent_FPR = actives, # this should be used for number of active variables
ytrain = draw[["ytrain"]],
# ytest = draw[["ytest"]],
yvalidate = draw[["yvalidate"]],
error_variance = error_var,
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
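# The TPR-at-fixed-FPR patch above (repeated in twostepYVCCV and ggmixHDBIC
# below) can be summarized by this standalone sketch; tpr_at_fpr is a
# hypothetical helper for illustration and is not called by the methods:
tpr_at_fpr <- function(selected_by_lambda, causal, not_causal, target = 0.05) {
  # selected_by_lambda: list of character vectors, one per lambda on the path,
  # giving the names of the selected (non-PC) variables
  fprs <- vapply(selected_by_lambda, function(nz) {
    FP <- length(setdiff(nz, causal)) # false positives
    FP / (FP + length(not_causal))    # FPR = FP / (FP + TN)
  }, numeric(1))
  i <- which.min(abs(fprs - target))  # lambda with FPR closest to the target
  length(intersect(selected_by_lambda[[i]], causal)) / length(causal)
}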
# lassoNOPC ---------------------------------------------------------------
# this uses PCs in training but ignores them for prediction
lassoNOPC <- new_method("lassoNOPC", "lassoNOPC",
method = function(model, draw) {
# draw <- draws@draws$r1.1
fitglmnet <- glmnet::glmnet(x = draw[["xtrain_lasso"]],
y = draw[["ytrain"]],
alpha = 1,
nlambda = 50,
standardize = FALSE,
penalty.factor = c(rep(1, ncol(draw[["xtrain"]])), rep(0,10)))
# yhat_lasso <- predict(fitglmnet, newx = draw[["xtest_lasso"]])
yhat_lasso <- cbind(1, draw[["xtest"]]) %*% coef(fitglmnet)[c("(Intercept)",colnames(draw[["xtest"]])),,drop = F]
cvmat <- apply((draw[["ytest"]] - yhat_lasso)^2, 2, mean)
lambda_min_lasso <- fitglmnet$lambda[which.min(cvmat)]
nz_names <- setdiff(rownames(coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop = F]),c("(Intercept)",paste0("PC",1:10)))
model_error <- l2norm(draw[["mu_train"]] -
draw[["xtrain"]] %*% coef(fitglmnet, s = lambda_min_lasso)[2:(ncol(draw[["xtrain"]]) + 1),,drop = F])
# defined in Bertsimas et al. 2016
# Best Subset Selection via a Modern Optimization Lens
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
# individual-level predictions
# yhat <- predict(fitglmnet, newx = draw[["xvalidate_lasso"]], s = lambda_min_lasso)
yhat <- cbind(1, draw[["xvalidate"]]) %*% coef(fitglmnet, s = lambda_min_lasso)[c("(Intercept)",colnames(draw[["xvalidate"]])),,drop = F]
# yhat <- cbind(1, draw[["xtest"]]) %*% coef(fitglmnet, s = "lambda.min")[c("(Intercept)",colnames(draw[["xtest"]])),,drop = F]
# yhat_train <- cbind(1, draw[["xtrain"]]) %*% coef(fitglmnet, s = "lambda.min")[c("(Intercept)",colnames(draw[["xtrain"]])),,drop = F]
yhat_train <- predict(fitglmnet, newx = draw[["xtrain_lasso"]], s = lambda_min_lasso)
error_var <- l2norm(yhat_train - draw[["ytrain"]])^2 / (length(draw[["ytrain"]]) - (length(nz_names)+1+10)) # add back intercept and PCs
list(beta = coef(fitglmnet, s = lambda_min_lasso)[-1,,drop = F],
model_error = model_error,
prediction_error = prediction_error,
eta = NA,
sigma2 = NA,
yhat = yhat, # on the validation set without principal components
nonzero = coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop = F],
nonzero_names = nz_names,
ytrain = draw[["ytrain"]],
ytest = draw[["ytest"]],
yvalidate = draw[["yvalidate"]],
error_variance = error_var,
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
# twostepYVCCV - two step YVC with cross-validation ------------------------
# this one uses the original y to compare to and stores the variance components (VC) (USE THIS ONE)
twostepYVCCV <- new_method("twostepYVCCV", "two step YVC with CV",
method = function(model, draw) {
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
# fit_lme <- coxme::lmekin(Y ~ 1 + (1|id), data = pheno_dat, varlist = model$kin)
# newy <- residuals(fit_lme)
# fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain"]], y = newy, standardize = F, alpha = 1)
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
# browser()
x1 <- cbind(rep(1, nrow(draw[["xtrain"]])))
fit_lme <- gaston::lmm.aireml(draw[["ytrain"]], x1, K = draw[["kin_train"]])
gaston_resid <- draw[["ytrain"]] - (fit_lme$BLUP_omega + fit_lme$BLUP_beta)
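# gaston::lmm.aireml fits y = X b + omega + e with omega ~ N(0, tau*K) and
# e ~ N(0, sigma2*I); here X is a column of 1s, so BLUP_beta is the estimated
# intercept and subtracting (BLUP_omega + BLUP_beta) removes the intercept and
# the polygenic effect before the lasso is run on the residuals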
fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain"]],
y = gaston_resid,
nlambda = 50,
standardize = FALSE,
alpha = 1,
intercept = T)
lambda_min_lasso <- fitglmnet$lambda.min
nz_names <- setdiff(rownames(coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop = F]),c("(Intercept)",paste0("PC",1:10)))
# PATCH December 4, 2019. this part is used to calculate TPR at a Fixed FPR of 5%
nonzero_names_alllambdas <- lapply(predict(fitglmnet, type = "nonzero", s = fitglmnet$lambda), function(i) {
# i = predict(fitglmnet, type = "nonzero", s = fitglmnet$lambda)$`50`
nz <- setdiff(colnames(draw[["xtrain"]][,i,drop = FALSE]), c("(Intercept)",paste0("PC",1:10))) # the fit here is on xtrain, which has no PC columns
FP <- length(setdiff(nz, draw[["causal"]])) # false positives
TN <- length(draw[["not_causal"]]) # True negatives
FPR <- FP / (FP + TN)
FPR
})
FPRS <- do.call(cbind, nonzero_names_alllambdas)
index_of_FPR_at_5_percent <- which(abs(FPRS-0.05) == min(abs(FPRS-0.05)))[1] # take the first value in case there are multiple matches
actives <- setdiff(colnames(draw[["xtrain"]][,predict(fitglmnet, type = "nonzero", s = fitglmnet$lambda[index_of_FPR_at_5_percent])[,1],drop = FALSE]),
c("(Intercept)",paste0("PC",1:10)))
TPR_at_5_percent_FPR <- length(intersect(actives, draw[["causal"]]))/length(draw[["causal"]])
### END of PATCH
# this is the design matrix of the selected variables plus a column of 1s for the intercept
x_nonzero_train <- cbind(1, draw[["xtrain"]][, nz_names, drop = FALSE])
x_nonzero_validate <- cbind(1, draw[["xvalidate"]][, nz_names, drop = FALSE])
xtx <- crossprod(x_nonzero_train)
# add small ridge in case solution has more than n nonzeros:
diag(xtx) <- diag(xtx) + ifelse(ncol(x_nonzero_train)>nrow(x_nonzero_train),1e-4,0)
bb <- solve(xtx, crossprod(x_nonzero_train, draw[["ytrain"]]))
# draw$beta[which(draw$beta!=0)]
yhat <- x_nonzero_validate %*% bb
# l2norm(yhat-draw$yvalidate)
# individual-level predictions
# yhat <- predict(fitglmnet, newx = draw[["xvalidate_lasso"]], s = lambda_min_lasso)
# yhat <- cbind(1, draw[["xtest"]]) %*% coef(fitglmnet, s = "lambda.min")[c("(Intercept)",colnames(draw[["xtest"]])),,drop = F]
# yhat_train <- cbind(1, draw[["xtrain"]]) %*% coef(fitglmnet, s = "lambda.min")[c("(Intercept)",colnames(draw[["xtrain"]])),,drop = F]
yhat_train <- predict(fitglmnet, newx = draw[["xtrain"]], s = lambda_min_lasso)
error_var <- l2norm(yhat_train - draw[["ytrain"]])^2 / (length(draw[["ytrain"]]) - (length(nz_names)+1)) # add back intercept; this fit has no PC columns
# defined in Bertsimas et al. 2016
# Best Subset Selection via a Modern Optimization Lens
model_error <- l2norm(draw[["mu_train"]] - x_nonzero_train %*% bb) # this uses the refitted betas
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
# these are the lasso-fitted betas
beta_orig <- beta_refit <- coef(fitglmnet, s = "lambda.min")[c(colnames(draw[["xtrain"]])),,drop = F]
# these are the refitted betas
beta_refit[match(rownames(bb)[-1], rownames(beta_orig)),,drop=FALSE] <- bb[-1]
# l2norm(beta_orig - draw$beta)
# l2norm(beta_refit - draw$beta)
list(beta_refit = beta_refit,
beta_truth = draw[["beta"]],
yhat = yhat,
nonzero = coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop=F],
nonzero_names = nz_names,
TPR_at_5_percent_FPR = TPR_at_5_percent_FPR,
ACTIVES_at_5_percent_FPR = actives, # this should be used for number of active variables
model_error = model_error,
prediction_error = prediction_error,
eta = fit_lme$tau, # this is the VC for kinship
sigma2 = fit_lme$sigma2, # this is VC for error
ytrain = draw[["ytrain"]],
yvalidate = draw[["yvalidate"]],
error_variance = error_var,
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
# twostepYVC --------------------------------------------------------------
# this one uses the original y to compare to and stores the variance components (VC)
# this also uses train/test/validate split
twostepYVC <- new_method("twostepYVC", "two step YVC",
method = function(model, draw) {
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
# fit_lme <- coxme::lmekin(Y ~ 1 + (1|id), data = pheno_dat, varlist = model$kin)
# newy <- residuals(fit_lme)
# fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain"]], y = newy, standardize = F, alpha = 1)
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
x1 <- cbind(rep(1, nrow(draw[["xtrain"]])))
fit_lme <- gaston::lmm.aireml(draw[["ytrain"]], x1, K = draw[["kin_train"]])
gaston_resid <- draw[["ytrain"]] - (fit_lme$BLUP_omega + fit_lme$BLUP_beta)
fitglmnet <- glmnet::glmnet(x = draw[["xtrain"]], y = gaston_resid,
nlambda = 50,
standardize = FALSE, alpha = 1, intercept = T)
yhat_lasso <- predict(fitglmnet, newx = draw[["xtest"]])
cvmat <- apply((draw[["ytest"]] - yhat_lasso)^2, 2, mean)
lambda_min_lasso <- fitglmnet$lambda[which.min(cvmat)]
nz_names <- setdiff(rownames(coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop = F]),c("(Intercept)"))
model_error <- l2norm(draw[["mu_train"]] -
draw[["xtrain"]] %*% coef(fitglmnet, s = lambda_min_lasso)[2:(ncol(draw[["xtrain"]]) + 1),,drop = F])
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
yhat <- predict(fitglmnet, newx = draw[["xvalidate"]], s = lambda_min_lasso)
yhat_train <- predict(fitglmnet, newx = draw[["xtrain"]], s = lambda_min_lasso)
error_var <- l2norm(yhat_train - draw[["ytrain"]])^2 / (length(draw[["ytrain"]]) - (length(nz_names)+1)) # add back intercept
list(beta = coef(fitglmnet, s = lambda_min_lasso)[-1,,drop = F],
yhat = yhat,
nonzero = coef(fitglmnet, s = lambda_min_lasso)[glmnet::nonzeroCoef(coef(fitglmnet, s = lambda_min_lasso)),,drop=F],
nonzero_names = nz_names,
model_error = model_error,
prediction_error = prediction_error,
eta = fit_lme$tau, # this is the VC for kinship
sigma2 = fit_lme$sigma2, # this is VC for error
ytrain = draw[["ytrain"]],
ytest = draw[["ytest"]],
yvalidate = draw[["yvalidate"]],
error_variance = error_var,
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
# ggmixed -----------------------------------------------------------------
# this uses train/test/validate splits
ggmixed <- new_method("ggmix", "ggmix",
method = function(model, draw) {
# browser()
fit <- ggmix(x = draw[["xtrain"]],
y = draw[["ytrain"]],
kinship = draw[["kin_train"]],
verbose = 0,
nlambda = 50,
# dfmax = 500,
eta_init = runif(1, min = 0.05, max = 0.95))
# hdbic <- gic(fit, an = log(length(draw[["ytrain"]])))
# hdbic <- gic(fit)
# plot(hdbic)
# sum(setdiff(rownames(coef(hdbic, type = "nonzero")), c("(Intercept)","eta","sigma2")) %in%
# draw[["causal"]])
# hdbic$lambda.min
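# type = "individual" returns yhat = X beta_hat plus the predicted random
# effect, which is formed from the test-by-train kinship passed via covariance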
predmat <- predict(fit,
newx = draw[["xtest"]],
type = "individual",
covariance = draw[["kin_test_train"]],
s = fit$lambda)
cvmat <- apply((draw[["ytest"]] - predmat)^2, 2, mean)
lambda_min_ggmix <- fit$result[which.min(cvmat), "Lambda"]
model_error <- l2norm(draw[["mu_train"]] -
draw[["xtrain"]] %*% predict(fit, s = lambda_min_ggmix, type = "coef")[2:(ncol(draw[["xtrain"]]) + 1),,drop = F])
# individual-level prediction
yhat <- predict(fit,
s = lambda_min_ggmix,
newx = draw[["xvalidate"]],
type = "individual",
covariance = draw[["kin_validate_train"]])
# yhat_hdbic <- predict(fit,
# s = hdbic$lambda.min,
# newx = draw[["xvalidate"]],
# type = "individual",
# covariance = draw[["kin_validate_train"]])
#
# l2norm(yhat-draw[["yvalidate"]])
# l2norm(yhat_hdbic-draw[["yvalidate"]])
# mse_value <- crossprod(predict(hdbic, newx = draw[["xtrain"]]) + ranef(hdbic) - draw[["ytrain"]]) / length(draw[["ytrain"]])
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
list(beta = predict(fit, s = lambda_min_ggmix, type = "coef")[2:(ncol(draw[["xtrain"]]) + 1),,drop = F], # this doesn't include the intercept and is a 1-column matrix
model_error = model_error,
prediction_error = prediction_error,
nonzero = coef(fit, type = "nonzero", s = lambda_min_ggmix),
nonzero_names = setdiff(rownames(coef(fit, type = "nonzero", s = lambda_min_ggmix)), c("(Intercept)","eta","sigma2")),
# yhat = predict(hdbic, newx = draw[["xtrain"]]) + ranef(hdbic),
yhat = yhat,
ytrain = draw[["ytrain"]],
ytest = draw[["ytest"]],
yvalidate = draw[["yvalidate"]],
eta = coef(fit, type = "nonzero", s = lambda_min_ggmix)["eta",],
sigma2 = coef(fit, type = "nonzero", s = lambda_min_ggmix)["sigma2",],
error_variance = (1 - coef(fit, type = "nonzero", s = lambda_min_ggmix)["eta",]) * coef(fit, type = "nonzero", s = lambda_min_ggmix)["sigma2",],
y = draw[["ytrain"]],
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
# ggmixHDBIC --------------------------------------------------------------
# this uses HDBIC to select the best model from the training set only, then predicts on the validation set.
ggmixedHDBIC <- new_method("ggmixHDBIC", "ggmixHDBIC",
method = function(model, draw) {
fit <- ggmix(x = draw[["xtrain"]],
y = draw[["ytrain"]],
kinship = draw[["kin_train"]],
verbose = 0,
nlambda = 50,
eta_init = runif(1, min = 0.05, max = 0.95))
hdbic <- gic(fit)
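# gic() computes the generalised information criterion along the lambda path;
# with ggmix's default penalty (assumed a_n = log(log(n)) * log(p)) this
# corresponds to the HDBIC, hence the method name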
nonzero_names <- setdiff(rownames(coef(fit, type = "nonzero", s = hdbic$lambda.min)), c("(Intercept)","eta","sigma2"))
# PATCH December 4, 2019. this part is used to calculate TPR at a Fixed FPR of 5%
nonzero_names_alllambdas <- lapply(coef(fit, type = "nonzero"), function(i) {
nz <- setdiff(rownames(i), c("(Intercept)","eta","sigma2"))
FP <- length(setdiff(nz, draw[["causal"]])) # false positives
TN <- length(draw[["not_causal"]]) # True negatives
FPR <- FP / (FP + TN)
FPR
})
FPRS <- do.call(cbind, nonzero_names_alllambdas)
index_of_FPR_at_5_percent <- which(abs(FPRS-0.05) == min(abs(FPRS-0.05)))[1] # take the first value in case there are multiple matches
actives <- setdiff(rownames(coef(fit, type = "nonzero", s = fit$lambda[index_of_FPR_at_5_percent])), c("(Intercept)","eta","sigma2"))
TPR_at_5_percent_FPR <- length(intersect(actives, draw[["causal"]]))/length(draw[["causal"]])
### END of PATCH
# this is the design matrix of the selected variables plus a column of 1s for the intercept
x_nonzero_train <- cbind(1,draw[["xtrain"]][, nonzero_names, drop = FALSE])
x_nonzero_validate <- cbind(1,draw[["xvalidate"]][, nonzero_names, drop = FALSE])
xtx <- crossprod(x_nonzero_train)
# add small ridge in case solution has more than n nonzeros:
diag(xtx) <- diag(xtx) + ifelse(ncol(x_nonzero_train)>nrow(x_nonzero_train),1e-4,0)
bb <- solve(xtx, crossprod(x_nonzero_train, draw[["ytrain"]]))
# this gives same result
# cbind(lm.fit(x = x_nonzero_train, y = draw[["ytrain"]])$coef)
# this is on validation set
yhat <- x_nonzero_validate %*% bb
model_error <- l2norm(draw[["mu_train"]] - x_nonzero_train %*% bb)
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
# these are the ggmix-fitted betas
beta_orig <- beta_refit <- coef(hdbic)[-1,,drop=F]
# these are the refitted betas
beta_refit[match(rownames(bb)[-1], rownames(beta_orig)),,drop=FALSE] <- bb[-1]
# some checks to make sure everything lines up
# all(rownames(beta_refit)==colnames(draw$xtrain))
# l2norm(draw$beta - beta_refit)
# plot(draw$beta,beta_refit)
# abline(a=0, b=1)
# beta_refit[match(rownames(bb)[-1], rownames(beta_orig)),,drop=FALSE]
# beta_orig[match(rownames(bb)[-1], rownames(beta_orig)),,drop=FALSE]
# colnames(draw$xtrain)[match(draw$causal, colnames(draw$xtrain))]
# cbind(colnames(draw$xtrain)[match(draw$causal, colnames(draw$xtrain))],draw$beta[match(draw$causal, colnames(draw$xtrain))])
# draw$beta
# all(rownames(coef(hdbic))[-1] == colnames(draw$xtrain))
list(beta_refit = beta_refit, # this doesn't include the intercept; a 1-column matrix of betas refitted by OLS
beta_truth = draw[["beta"]],
model_error = model_error,
prediction_error = prediction_error,
nonzero = coef(fit, type = "nonzero", s = hdbic$lambda.min),
nonzero_names = nonzero_names,
TPR_at_5_percent_FPR = TPR_at_5_percent_FPR,
ACTIVES_at_5_percent_FPR = actives, # this should be used for number of active variables
yhat = yhat,
ytrain = draw[["ytrain"]],
yvalidate = draw[["yvalidate"]],
eta = coef(fit, type = "nonzero", s = hdbic$lambda.min)["eta",],
sigma2 = coef(fit, type = "nonzero", s = hdbic$lambda.min)["sigma2",],
error_variance = (1 - coef(fit, type = "nonzero", s = hdbic$lambda.min)["eta",]) * coef(fit, type = "nonzero", s = hdbic$lambda.min)["sigma2",],
y = draw[["ytrain"]],
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
# ggmixedHDBICLMM ---------------------------------------------------------
ggmixedHDBICLMM <- new_method("ggmixHDBICLMM", "ggmixHDBIC with LMM for prediction",
method = function(model, draw) {
# browser()
fit <- ggmix(x = draw[["xtrain"]],
y = draw[["ytrain"]],
kinship = draw[["kin_train"]],
verbose = 2,
nlambda = 50,
# dfmax = 500,
eta_init = runif(1, min = 0.05, max = 0.95))
# hdbic <- gic(fit, an = log(length(draw[["ytrain"]])))
hdbic <- gic(fit)
# plot(hdbic)
# sum(setdiff(rownames(coef(hdbic, type = "nonzero")), c("(Intercept)","eta","sigma2")) %in%
# draw[["causal"]])
# hdbic$lambda.min
# predmat <- predict(fit,
# newx = draw[["xtest"]],
# type = "individual",
# covariance = draw[["kin_test_train"]],
# s = fit$lambda)
# cvmat <- apply((draw[["ytest"]] - predmat)^2, 2, mean)
# lambda_min_ggmix <- fit$result[which.min(cvmat), "Lambda"]
# individual-level prediction
# yhat <- predict(fit,
# s = lambda_min_ggmix,
# newx = draw[["xvalidate"]],
# type = "individual",
# covariance = draw[["kin_validate_train"]])
#
# yhat <- predict(fit,
# s = hdbic$lambda.min,
# newx = draw[["xvalidate"]],
# type = "individual",
# covariance = draw[["kin_validate_train"]])
# l2norm(yhat-draw[["yvalidate"]])
# l2norm(yhat_hdbic-draw[["yvalidate"]])
# mse_value <- crossprod(predict(hdbic, newx = draw[["xtrain"]]) + ranef(hdbic) - draw[["ytrain"]]) / length(draw[["ytrain"]])
nonzero_names <- setdiff(rownames(coef(fit, type = "nonzero", s = hdbic$lambda.min)), c("(Intercept)","eta","sigma2"))
# this is the design matrix of the selected variables plus a column of 1s for the intercept
x_nonzero_train <- cbind(1,draw[["xtrain"]][, nonzero_names, drop = FALSE])
x_nonzero_validate <- cbind(1,draw[["xvalidate"]][, nonzero_names, drop = FALSE])
# x1 <- cbind(rep(1, nrow(draw[["xtrain"]])))
fit_lme <- gaston::lmm.aireml(Y = draw[["ytrain"]], X = x_nonzero_train, K = draw[["kin_train"]])
# eta: proportion of variance attributable to kinship, tau / (tau + sigma2)
eta_fit <- fit_lme$tau / (fit_lme$tau + fit_lme$sigma2)
# yhat <- fit_lme$tau * draw[["kin_validate_train"]] %*% fit_lme$Py
# fit_lme$BLUP_beta %>% length()
# ncol(x_nonzero_train)
yhat <- x_nonzero_validate %*% fit_lme$BLUP_beta +
ggmix:::bi_future_lassofullrank(
eta = eta_fit,
# eta = coef(fit, type = "nonzero", s = hdbic$lambda.min)["eta",],
beta = as.matrix(fit_lme$BLUP_beta),
eigenvalues = fit[["ggmix_object"]][["D"]],
eigenvectors = fit[["ggmix_object"]][["U"]],
x = cbind(1, fit[["ggmix_object"]][["x"]][,nonzero_names, drop = FALSE]), # these are the transformed x
y = fit[["ggmix_object"]][["y"]], # these are the transformed y
covariance = draw[["kin_validate_train"]])
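# the first term is the fixed-effect prediction on the validation design; the
# second is the predicted random effect for the validation individuals,
# computed by the internal ggmix helper from the validation-by-train kinship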
model_error <- l2norm(draw[["mu_train"]] -
x_nonzero_train %*% as.matrix(fit_lme$BLUP_beta))
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
# xtx <- crossprod(x_nonzero_train)
# # add small ridge in case solution has more
# # than n nonzeros:
# diag(xtx) <- diag(xtx) + 1e-4
# bb <- solve(xtx,
# crossprod(x_nonzero_train, draw[["ytrain"]]))
# # draw$beta[which(draw$beta!=0)]
# yhat <- x_nonzero_validate %*% bb
# # l2norm(yhat-draw$yvalidate)
list(beta = predict(fit, s = hdbic$lambda.min, type = "coef")[2:(ncol(draw[["xtrain"]]) + 1),,drop = F], # this doesn't include the intercept and is a 1-column matrix
model_error = model_error,
prediction_error = prediction_error,
nonzero = coef(fit, type = "nonzero", s = hdbic$lambda.min),
nonzero_names = nonzero_names,
# yhat = predict(hdbic, newx = draw[["xtrain"]]) + ranef(hdbic),
yhat = yhat,
ytrain = draw[["ytrain"]],
ytest = draw[["ytest"]],
yvalidate = draw[["yvalidate"]],
eta = eta_fit,
sigma2 = fit_lme$sigma2,
error_variance = (1 - eta_fit) * fit_lme$sigma2,
y = draw[["ytrain"]],
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
refit <- new_method_extension(name = "refit", label = "refitted",
method_extension = function(model, draw, out,
base_method) {
beta <- apply(out$beta, 2, function(b) {
ii <- b != 0
if (sum(ii) == 0)
return(b)
xtx <- crossprod(model$x[, ii])
# add small ridge in case solution has more
# than n nonzeros:
diag(xtx) <- diag(xtx) + 1e-4
bb <- solve(xtx,
crossprod(model$x[, ii], draw))
b[ii] <- bb
return(b)
})
return(list(beta = beta,
yhat = model$x %*% beta,
lambda = out$lambda,
df = rep(NA, ncol(beta))))
})
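# In the simulator framework a method extension composes with a method via
# `+`; a minimal sketch (sim is a hypothetical simulation object that already
# holds the model, draws, and method outputs):
if (FALSE) {
  sim <- run_method(sim, list(lasso + refit))
}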
# deprecated methods (DO NOT USE) -------------------------------------------
# this one uses residuals to compare to (DO NOT USE)
twostep <- new_method("twostep", "two step",
method = function(model, draw) {
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
# fit_lme <- coxme::lmekin(Y ~ 1 + (1|id), data = pheno_dat, varlist = model$kin)
# newy <- residuals(fit_lme)
# fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain"]], y = newy, standardize = F, alpha = 1)
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
x1 <- cbind(rep(1, nrow(draw[["xtrain"]])))
fit_lme <- gaston::lmm.aireml(draw[["ytrain"]], x1, K = draw[["kin_train"]])
gaston_resid <- draw[["ytrain"]] - (fit_lme$BLUP_omega + fit_lme$BLUP_beta)
fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain"]], y = gaston_resid,
standardize = T, alpha = 1, intercept = T)
nz_names <- setdiff(rownames(coef(fitglmnet, s = "lambda.min")[glmnet::nonzeroCoef(coef(fitglmnet, s = "lambda.min")),,drop = F]),c("(Intercept)"))
model_error <- l2norm(draw[["mu_train"]] -
draw[["xtrain"]] %*% coef(fitglmnet, s = "lambda.min")[2:(ncol(draw[["xtrain"]]) + 1),,drop = F])
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
# this should be compared with gaston_resid
yhat <- predict(fitglmnet, newx = draw[["xtrain"]], s = "lambda.min")
error_var <- l2norm(yhat - gaston_resid)^2 / (length(draw[["ytrain"]]) - length(nz_names))
list(beta = coef(fitglmnet, s = "lambda.min")[-1,,drop = F],
yhat = yhat,
nonzero = coef(fitglmnet, s = "lambda.min")[glmnet::nonzeroCoef(coef(fitglmnet, s = "lambda.min")),,drop=F],
nonzero_names = nz_names,
model_error = model_error,
prediction_error = prediction_error,
eta = NA,
sigma2 = NA,
y = gaston_resid,
error_variance = error_var,
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
# this one uses the original y to compare to (DO NOT USE)
twostepY <- new_method("twostepY", "two step Y",
method = function(model, draw) {
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
# fit_lme <- coxme::lmekin(Y ~ 1 + (1|id), data = pheno_dat, varlist = model$kin)
# newy <- residuals(fit_lme)
# fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain"]], y = newy, standardize = F, alpha = 1)
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
x1 <- cbind(rep(1, nrow(draw[["xtrain"]])))
fit_lme <- gaston::lmm.aireml(draw[["ytrain"]], x1, K = draw[["kin_train"]])
gaston_resid <- draw[["ytrain"]] - (fit_lme$BLUP_omega + fit_lme$BLUP_beta)
fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain"]], y = gaston_resid,
standardize = T, alpha = 1, intercept = T)
nz_names <- setdiff(rownames(coef(fitglmnet, s = "lambda.min")[glmnet::nonzeroCoef(coef(fitglmnet, s = "lambda.min")),,drop = F]),c("(Intercept)"))
model_error <- l2norm(draw[["mu_train"]] -
draw[["xtrain"]] %*% coef(fitglmnet, s = "lambda.min")[2:(ncol(draw[["xtrain"]]) + 1),,drop = F])
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
yhat <- predict(fitglmnet, newx = draw[["xtest"]], s = "lambda.min")
yhat_train <- predict(fitglmnet, newx = draw[["xtrain"]], s = "lambda.min")
error_var <- l2norm(yhat_train - draw[["ytrain"]])^2 / (length(draw[["ytrain"]]) - length(nz_names))
list(beta = coef(fitglmnet, s = "lambda.min")[-1,,drop = F],
yhat = yhat,
nonzero = coef(fitglmnet, s = "lambda.min")[glmnet::nonzeroCoef(coef(fitglmnet, s = "lambda.min")),,drop=F],
nonzero_names = nz_names,
model_error = model_error,
prediction_error = prediction_error,
eta = NA,
sigma2 = NA,
y = draw[["ytrain"]],
ytest = draw[["ytest"]],
error_variance = error_var,
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})
# this one uses residuals to compare to and stores the variance components (VC) (DO NOT USE)
twostepVC <- new_method("twostepVC", "two step VC",
method = function(model, draw) {
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
# fit_lme <- coxme::lmekin(Y ~ 1 + (1|id), data = pheno_dat, varlist = model$kin)
# newy <- residuals(fit_lme)
# fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain"]], y = newy, standardize = F, alpha = 1)
# pheno_dat <- data.frame(Y = draw, id = rownames(model$kin))
x1 <- cbind(rep(1, nrow(draw[["xtrain"]])))
fit_lme <- gaston::lmm.aireml(draw[["ytrain"]], x1, K = draw[["kin_train"]])
gaston_resid <- draw[["ytrain"]] - (fit_lme$BLUP_omega + fit_lme$BLUP_beta)
fitglmnet <- glmnet::cv.glmnet(x = draw[["xtrain"]], y = gaston_resid,
standardize = T, alpha = 1, intercept = T)
nz_names <- setdiff(rownames(coef(fitglmnet, s = "lambda.min")[glmnet::nonzeroCoef(coef(fitglmnet, s = "lambda.min")),,drop = F]),c("(Intercept)"))
model_error <- l2norm(draw[["mu_train"]] -
draw[["xtrain"]] %*% coef(fitglmnet, s = "lambda.min")[2:(ncol(draw[["xtrain"]]) + 1),,drop = F])
prediction_error <- model_error^2 / l2norm(draw[["mu_train"]])^2
# this should be compared with gaston_resid
yhat <- predict(fitglmnet, newx = draw[["xtrain"]], s = "lambda.min")
error_var <- l2norm(yhat - gaston_resid)^2 / (length(draw[["ytrain"]]) - length(nz_names))
list(beta = coef(fitglmnet, s = "lambda.min")[-1,,drop = F],
yhat = yhat,
nonzero = coef(fitglmnet, s = "lambda.min")[glmnet::nonzeroCoef(coef(fitglmnet, s = "lambda.min")),,drop=F],
nonzero_names = nz_names,
model_error = model_error,
prediction_error = prediction_error,
eta = fit_lme$tau, # this is the VC for kinship
sigma2 = fit_lme$sigma2, # this is VC for error
y = gaston_resid,
error_variance = error_var,
causal = draw[["causal"]],
not_causal = draw[["not_causal"]],
p = ncol(draw[["xtrain"]])
)
})