penfam: Variable Selection in Linear Mixed Models for SNP Data

######################################
# R Source code file for simulation of ggmix paper
# Notes:
# This is the main simulator file
# I ran this on tmux by copy pasting into an rsession on hydra.
# parallel doesnt work in rstudio
# use this code to save the results. turns out you cant run simulator in parallel "by hand"
# you need to load the simulator object prior to adding new simulations to the object.
#
# Since the simulations take a long time to run, I save the results to a .rds file
# and then create figures based on that
# Author: Sahir Bhatnagar
# Created: 2018
# Updated: Dec 4, 2019
# To address referee comments. Need to add simulation where n is approx equal to p
#####################################



# This is the main simulator file
# rm(list = ls())

# setwd("/home/sahir/git_repositories/ggmix/simulation/")
pacman::p_load(simulator) # this file was created under simulator version 0.2.0
source("/home/sahir/git_repositories/ggmix/simulation/model_functions.R")
source("/home/sahir/git_repositories/ggmix/simulation/method_functions.R")
source("/home/sahir/git_repositories/ggmix/simulation/eval_functions.R")

# source("/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/ggmix/simulation/model_functions.R")
# source("/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/ggmix/simulation/method_functions.R")
# source("/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/ggmix/simulation/eval_functions.R")

## @knitr init

# name_of_simulation <- "thesis-ggmix-july1" this had only one eta value
# name_of_simulation <- "thesis-ggmix-july3" # this has more than 1 eta value 0.1,0.2,0.3,0.4
# name_of_simulation <- "thesis-ggmix-july12" # this has percent causal 0,0.01, and eta=0.1, 0.5
# name_of_simulation <- "ggmix-mar5" # this has percent causal 0,0.01, and eta=0.1, 0.5
# name_of_simulation <- "ggmix-apr29" # this has train/test split 50/50 split
# name_of_simulation <- "ggmix-may7" # this has train/test/validation split 60/20/20 split
# name_of_simulation <- "ggmix-jul8" # this has train/test/validation split 60/20/20 split but with dfmax not specified in ggmix
# name_of_simulation <- "ggmix-jul10" # this has train/validation split 80/20 split and HDBIC for ggmix ,with lm on active snps for prediction, and TPR at fixed FPR of 5%

## @knitr main


# I used this on Decemerb 4th 2019, because I wanted to re-use the simulated data,
# but apply the three methods with the TPR calculated at a fixed TPR of 5%
# For the n=p simulation, I need to create a new simulatior object. See below.
sim <- simulator::load_simulation(name_of_simulation, dir = "simulation/")
# sim <- sim %>%
#   run_method(list(twostepYVCCV, lassoCV, ggmixedHDBIC),
#              # run_method(list(lasso, ggmixed, twostepYVC, lassoNOPC),
#              parallel = list(socket_names = 40,
#                              libraries = c("glmnet","magrittr","MASS","Matrix","coxme","gaston","ggmix","popkin","bnpsd"))) %>%
#   evaluate(list(modelerror, prederror,tpr, fpr, nactive, eta, sigma2, tprFPR5, nactiveFPR5,
#                 correct_sparsity,mse, errorvariance, estimationerror))
# save_simulation(sim)
# as.data.frame(evals(sim))
# ls()
# # nsim needs to be at least 2
#
# # sim <- new_simulation(name_of_simulation, "Thesis 2018", dir = "simulation/") %>%
# #   generate_model(make_mixed_model_SSC, b0 = 0, sigma2 = list(2, 4),
# #                  eta = list(0.1, 0.5),
# #                  percent_causal = 1,
# #                  beta_mean = list(0.2, 0.6, 1),
# #                  percent_overlap = list("0","100"),
# #                  vary_along = c("sigma2","eta","beta_mean","percent_overlap")) %>%
# #   simulate_from_model(nsim = 4, index = 1:50) %>%
# #   run_method(list(lasso, ggmix, twostep),
# #              parallel = list(socket_names = 35,
# #                              libraries = c("glmnet","magrittr","MASS","progress","Matrix","coxme","gaston")))
# #
# # sim <- sim %>% evaluate(list(modelerror, tpr, fpr, nactive, eta, sigma2))
# # save_simulation(sim)
# # sim <- new_simulation(name_of_simulation, "jul-10", dir = "simulation/") %>%
#
# sim <- new_simulation(name_of_simulation, "dec-4", dir = "simulation/") %>%
#   generate_model(make_ADmixed_model_train_validate,
#                  b0 = 1,
#                  sigma2 = 1,
#                  beta_mean = 1,
#                  k = 10,
#                  s = 0.5,
#                  Fst = 0.1,
#                  geography = "1d",
#                  n = 1000, # 80/20 split
#                  p_design = 5000,
#                  p_kinship = 10000,
#                  percent_causal = list(0, 0.01),
#                  percent_overlap = list("0","100"),
#                  eta = list(0.1, 0.3),
#                  vary_along = c("percent_overlap","percent_causal","eta")
#
#                  # n = 1000, # 80/20 split
#                  # p_design = 5000,
#                  # p_kinship = 10000,
#                  # percent_causal = 0.01,
#                  # percent_overlap = "0",
#                  # eta = 0.1#,
#                  # vary_along = c("percent_overlap","percent_causal","eta")
#
#                  # n = 500, # 60/20/20 split
#                  # p_design = 500,
#                  # p_kinship = 1000
#                  # eta = 0.50,
#                  # geography = "1d",
#                  # percent_causal = 0,
#                  # percent_overlap = "0",
#                  # n = 800, # 50% train, 50% test
#                  # p_design = 500,
#                  # p_kinship = 1000
#
#   ) %>%
#   # simulate_from_model(nsim = 5, index = 1:40) %>%
#   simulate_from_model(nsim = 5, index = 1:40) %>%
#   run_method(list(twostepYVCCV, lassoCV, ggmixedHDBIC),
#              # run_method(list(lasso, ggmixed, twostepYVC, lassoNOPC),
#              parallel = list(socket_names = 40,
#                              libraries = c("glmnet","magrittr","MASS","Matrix","coxme","gaston","ggmix","popkin","bnpsd"))) %>%
#   evaluate(list(modelerror, prederror,tpr, fpr, nactive, eta, sigma2, tprFPR5, nactiveFPR5,
#                 correct_sparsity,mse, errorvariance, estimationerror))
# save_simulation(sim)
# as.data.frame(evals(sim))
# ls()



# n=p simulation ----------------------------------------------------------
# December 5th, 2019
# this has train/validation split 80/20 split and HDBIC for ggmix ,with lm on active snps for prediction, and n = p simulation for referee comments
name_of_simulation <- "ggmix-dec5"
sim <- new_simulation(name_of_simulation, "dec-5", dir = "simulation/") %>%
  generate_model(make_ADmixed_model_train_validate,
                 b0 = 1,
                 sigma2 = 1,
                 beta_mean = 1,
                 k = 10,
                 s = 0.5,
                 Fst = 0.1,
                 geography = "1d",
                 n = 1000, # 80/20 split
                 p_design = 5000,
                 p_kinship = 1000,
                 percent_causal = list(0, 0.01),
                 percent_overlap = list("0","100"),
                 eta = list(0.1, 0.3),
                 vary_along = c("percent_overlap","percent_causal","eta")

                 # n = 1000, # 80/20 split
                 # p_design = 5000,
                 # p_kinship = 10000,
                 # percent_causal = 0.01,
                 # percent_overlap = "0",
                 # eta = 0.1#,
                 # vary_along = c("percent_overlap","percent_causal","eta")

                 # n = 500, # 60/20/20 split
                 # p_design = 500,
                 # p_kinship = 1000
                 # eta = 0.50,
                 # geography = "1d",
                 # percent_causal = 0,
                 # percent_overlap = "0",
                 # n = 800, # 50% train, 50% test
                 # p_design = 500,
                 # p_kinship = 1000

  ) %>%
  # simulate_from_model(nsim = 5, index = 1:40) %>%
  simulate_from_model(nsim = 5, index = 1:40) %>%
  run_method(list(twostepYVCCV, lassoCV, ggmixedHDBIC),
             # run_method(list(lasso, ggmixed, twostepYVC, lassoNOPC),
             parallel = list(socket_names = 40,
                             libraries = c("glmnet","magrittr","MASS","Matrix","coxme","gaston","ggmix","popkin","bnpsd"))) %>%
  evaluate(list(modelerror, prederror,tpr, fpr, nactive, eta, sigma2, tprFPR5, nactiveFPR5,
                correct_sparsity,mse, errorvariance, estimationerror))
save_simulation(sim)
as.data.frame(evals(sim))
ls()


sim <- simulator::load_simulation(name_of_simulation, dir = "simulation/")
df <- as.data.frame(evals(sim))
saveRDS(df, file = "simulation/simulation_results/dec_5_2019_results.rds") # this has train/validate split only. and doing re-fit for MSE and HDBIC for ggmix and TPR at FPR 5 % and n=1000=kinship=1000

# make sure most recent version of ggmix is installed for the parallel code to work!!!!#!#@$!@$!@$
# remotes::install_github('sahirbhatnagar/ggmix', ref = "validate")

# ggmixed@method(draw = draws(sim)@draws$r1.2)
#
# tt <- ggmixed@method(draw = draws(sim)@draws$r1.1)
# draws(sim)@draws$r1.2$xtrain
# ls()
# tt$causal
# tt$nonzero
# sim <- sim %>% run_method(list(lasso, ggmixed))
# tp <- simulator::load_draws(dir = "simulation/", model_name = "ggmix_05_07_2019")
# sim <- sim %>% run_method(list(lasso, ggmixed))#,#, twostep, twostepY),
#              parallel = list(socket_names = 8,
#                              libraries = c("glmnet","magrittr","MASS","Matrix","coxme","gaston","ggmix","popkin","bnpsd"))

# save_simulation(sim)
# sim <- sim %>% evaluate(list(modelerror, prederror,tpr, fpr, nactive, eta, sigma2,
#                              correct_sparsity,mse, errorvariance))
# save_simulation(sim)
# as.data.frame(evals(sim))
# ls()
#
# sim <- sim %>% run_method(list(twostepYVC),
#                   parallel = list(socket_names = 40,
#                                   libraries = c("glmnet","magrittr","MASS","Matrix","coxme","gaston","ggmix","popkin","bnpsd"))) %>%
#   evaluate(list(modelerror, prederror,tpr, fpr, nactive, eta, sigma2,
#                 correct_sparsity,mse, errorvariance))
#
# sim <- sim %>% run_method(list(lasso, lassoNOPC, ggmixed),
#                           parallel = list(socket_names = 40,
#                                           libraries = c("glmnet","magrittr","MASS","Matrix","coxme","gaston","ggmix","popkin","bnpsd"))) %>%
#   evaluate(list(modelerror, prederror,tpr, fpr, nactive, eta, sigma2,
#                 correct_sparsity,mse, errorvariance))
#
# save_simulation(sim)
# ls()
#
# sim <- load_simulation(name = name_of_simulation, dir = "/home/sahir/git_repositories/ggmix/simulation/")
# sim %>% subset_simulation(methods = c("lasso","ggmix")) %>% plot_eval("mse")
# sim %>% plot_eval("tprFPR5")
# sim %>% plot_eval("nactiveFPR5")
# sim2 <- sim %>% subset_simulation(percent_overlap == "100" & percent_causal == 0.01 & eta == 0.10)
# name_of_simulation <- "ggmix-jul8"
# label = "jul-8"
# sim2 <- simulator::relabel(sim2, label = label)
# sim2 <- simulator::rename(sim2, name = name_of_simulation)
# save_simulation(sim2)
# sim2

# name_of_simulation <- "ggmix-jul8"
# sim2 <- load_simulation(name = name_of_simulation, dir = "/home/sahir/git_repositories/ggmix/simulation/")
#
# sim2 <- sim2 %>% run_method(list(lasso, lassoNOPC, ggmixed),
#                           parallel = list(socket_names = 40,
#                                           libraries = c("glmnet","magrittr","MASS","Matrix","coxme","gaston","ggmix","popkin","bnpsd"))) %>%
#   evaluate(list(modelerror, prederror,tpr, fpr, nactive, eta, sigma2,
#                 correct_sparsity,mse, errorvariance))
#
# save_simulation(sim2)
# ls()

rm(sim)
sim <- load_simulation(name = name_of_simulation, dir = "/home/sahir/git_repositories/ggmix/simulation/")
# sim <- sim %>%
#   run_method(list(ggmixedHDBIC),
#              parallel = list(socket_names = 40,
#                              libraries = c("glmnet","magrittr","MASS","Matrix","coxme","gaston","ggmix","popkin","bnpsd"))) %>%
#   evaluate(list(modelerror, prederror,tpr, fpr, nactive, eta, sigma2,
#                 correct_sparsity,mse, errorvariance))
# save_simulation(sim)
# as.data.frame(evals(sim))
# ls()

# sim %>% subset_simulation(methods = c("lasso","ggmix","lassoNOPC","ggmixHDBIC","lassoCV","twostepYVCCV")) %>% plot_eval("mse")
p1 <- sim %>% plot_eval("mse")

sim %>% subset_simulation(methods = c("lasso","ggmix","lassoNOPC","ggmixHDBIC")) %>% plot_eval("fpr")
sim %>% subset_simulation(methods = c("lasso","ggmix","lassoNOPC","ggmixHDBIC")) %>% plot_eval("fpr")
sim %>% subset_simulation(methods = c("lasso","ggmix","lassoNOPC","ggmixHDBIC","lassoCV")) %>% plot_eval("correct_sparsity")
sim %>% plot_eval("tpr")
sim %>% plot_eval("fpr")
sim %>% subset_simulation(methods = c("ggmix","ggmixHDBIC","lassoCV")) %>% plot_eval("errorvar")
sim %>% plot_eval("eta")
sim %>% plot_eval("time")
sim %>% plot_eval("nactive")
sim %>% tabulate_eval("nactive")
sim %>% plot_eval("me")
sim %>% subset_simulation(methods = c("lasso","ggmix","lassoNOPC","ggmixHDBIC")) %>% plot_eval("prederror")
sim %>% subset_simulation(methods = c("ggmix","ggmixHDBIC")) %>% plot_eval("eta")
sim %>% subset_simulation(methods = c("ggmix","ggmixHDBIC")) %>% plot_eval("errorvar")
sim %>% subset_simulation(methods = c("ggmix","ggmixHDBIC")) %>% plot_eval("sigma2")
sim %>% plot_eval("estimationerror")

plot_eval(sim, "tpr")
plot_eval(sim, "fpr")
plot_eval(sim, "correct_sparsity")

# save results ------------------------------------------------------------
sim <- load_simulation(name = name_of_simulation, dir = "/home/sahir/git_repositories/ggmix/simulation/")
df <- as.data.frame(evals(sim))
# sim %>% subset_simulation(methods = c("ggmix","lasso"))
# saveRDS(df, file = "simulation/simulation_results/may_02_2019_results.rds")
# saveRDS(df, file = "simulation/simulation_results/may_05_2019_results.rds") # this has lasso1se
# saveRDS(df, file = "simulation/simulation_results/may_06_2019_results.rds") # this has lasso1se + proper variance components for twostep, but im not using lasso1se
saveRDS(df, file = "simulation/simulation_results/jul_10_2019_results_v2.rds") # this has train/validate split only. and doing re-fit for MSE and HDBIC for ggmix and TPR at FPR 5 %
df %>% filter(Method=="twostepYVCCV")

simulator::tabulate_eval(sim, "mse")

# sim <- sim %>%
#   run_method(list(lasso, ggmixed, twostepY),
#              parallel = list(socket_names = 20,
#                              libraries = c("glmnet","magrittr","MASS","Matrix",
#                                            "coxme","gaston","popkin","bnpsd", "ggmix")))
# save_simulation(sim)
# sim <- sim %>%
#   evaluate(list(modelerror, prederror,tpr, fpr, nactive, eta, sigma2,
#                 correct_sparsity,mse, errorvariance))
# save_simulation(sim)
# as.data.frame(evals(sim))
# ls()

# ns <- seq(500,4000, by = 500)
# res <- vector("numeric", length = length(ns))

# for(jj in seq_along(ns)) {
# dat <- make_ADmixed_model_not_sim(b0 = 0, sigma2 = 1,
#                                   eta = 0.1,
#                                   # n = ns[jj],
#                                   n = 1000,
#                                   p_test = 1000,
#                                   beta_mean = 0.5,
#                                   # p_test = 500,
#                                   p_kinship = 10000,
#                                   geography = "ind",
#                                   # geography = "circ",
#                                   percent_causal = 0.01,
#                                   percent_overlap = "0",
#                                   # percent_overlap = "100",
#                                   k = 5, s = 0.5, Fst = 0.1)
#
# pheno_dat <- data.frame(Y = dat$y, id = paste0("ID",1:length(dat$y)))
# x1 <- cbind(rep(1, nrow(dat$x)))
# fit <- gaston::lmm.aireml(dat$y, x1, K = dat$kin)
# kin <- gaston::as.bed.matrix(dat$Xkinship)
# gaston::standardize(kin) <- "p"
# kin[1:5,1:5]
# kins <- gaston::GRM(kin)
# dim(kins)
# kins <- crossprod(dat$Xkinship)/ncol(dat$Xkinship)
# dim(kins)
# kins[1:5,1:5]
# popkin::plotPopkin(list(dat$kin, kins))
# fit <- gaston::lmm.aireml(dat$y, x1, K = kins)
#
# res[jj] <- fit$tau




# simdata -----------------------------------------------------------------
devtools::load_all()

library(ggmix)
library(glmnet)
library(gaston)
data("admixed")

## ---- ggmix ----
fit_ggmix <- ggmix(x = admixed$x, y = admixed$y, kinship = admixed$kin, verbose = 1)
bicGGMIX <- gic(fit_ggmix, an = log(length(admixed$y)))
yhat_ggmix <- predict(bicGGMIX, newx = admixed$x)
RMSE_ggmix <- l2norm(yhat_ggmix - admixed$y)

## ---- two-step ----
x1 <- cbind(rep(1, nrow(admixed$x)))
fit_lme <- gaston::lmm.aireml(Y = admixed$y, X = x1, K = admixed$kin)
gaston_resid <- admixed$y - (fit_lme$BLUP_omega + fit_lme$BLUP_beta)
fitglmnet <- glmnet::cv.glmnet(x = admixed$x, y = gaston_resid,
                               standardize = T, alpha = 1, intercept = T)
yhat_twostep <- predict(fitglmnet, newx = admixed$x, s = "lambda.min")
RMSE_twostep <- l2norm(yhat_twostep - admixed$y)

## ---- lasso ----
fit_glmnet <- cv.glmnet(x = admixed$x_lasso, y = admixed$y,
                        alpha = 1, standardize = T,
                        penalty.factor = c(rep(1, ncol(admixed$x)), rep(0,10)))
# extract only betas for SNPs
betas_lasso <- coef(fit_glmnet, s = "lambda.min")[1:(ncol(admixed$x)+1), , drop = F]
yhat_lasso <- cbind(1, admixed$x) %*% betas_lasso
RMSE_lasso <- l2norm(yhat_lasso - admixed$y)

# karim data --------------------------------------------------------------






pacman::p_load(gaston)
data("karim")
Phi <- 2 * karim$kin1
P = MASS::mvrnorm(1, rep(0,600), karim$s.g * Phi)
y <- 1 + karim$G %*% karim$b + P + rnorm(600,0,karim$s.e)

pheno_dat <- data.frame(Y = y, id = paste0("ID",1:length(y)))
x1 <- cbind(rep(1, nrow(karim$G)))
fit <- gaston::lmm.aireml(y, x1, K = Phi)

gaston_resid <- y - (fit$BLUP_omega + fit$BLUP_beta)
hist(gaston_resid)
fitglmnet <- glmnet::cv.glmnet(x = karim$G,
                               y = gaston_resid,
                               standardize = T, alpha = 1, intercept = T)
plot(fitglmnet)
pacman::p_load(magrittr)
head(coef(fitglmnet))
yhat <- predict(fitglmnet, newx = karim$G, s = "lambda.min")
plot(yhat, y)
karim$s.g
karim$s.e
fit_2$tau
fit_2$sigma2
karim$h.tot

# need an ID variable
dat <- data.frame(Y = y, x=1, id = 1:600)

# provide the kinship matrix
gfit1 <- coxme::lmekin(Y ~ x + (1|id), data=dat, varlist=Phi)
gfit1


karim$h.tot



fit <- ggmix(x = karim$G,
             y = y,
             kinship = Phi,
             verbose = 2)
hdbic <- gic(fit, an = log(600))
help(gic)
plot(hdbic)
dev.off()

# error variance
coef(hdbic, type = "nonzero")["sigma2",] * (1-coef(hdbic, type = "nonzero")["eta",])

#kinship variance
coef(hdbic, type = "nonzero")["sigma2",] * (coef(hdbic, type = "nonzero")["eta",])



# }
# plot(ns, res)

# plot(dat$y - (fit$BLUP_omega + fit$BLUP_beta),
#      newy)
# all.equal(dat$y - (fit$BLUP_omega + fit$BLUP_beta),
#      newy)
# abline(a=0,b=1)
gaston_resid <- dat$y - (fit$BLUP_omega + fit$BLUP_beta)
hist(gaston_resid)
fitglmnet <- glmnet::cv.glmnet(x = dat$x, y = gaston_resid, standardize = T, alpha = 1, intercept = T)
plot(fitglmnet)



res$causal
res$not_causal %>% length()
res$Xtest %>% colnames() %>% length
all(colnames(res$Xtest) == res$not_causal)
# res$x_lasso
hist(res$y)
res$kin %>% dim



# analyze results ---------------------------------------------------------

source("simulation/packages.R")
df <- readRDS("simulation/simulation_results/june_29_2018_results.rds")
df <- as.data.frame(evals(sim))
df <- df %>% separate(Model, into = c("simnames","b0","eta","Fst","geography","k","n","pkinship","pdesign","percentcausal","percentoverlap","s","sigma2"),
                      sep = "/")
DT <- as.data.table(df)

trop <- RSkittleBrewer::RSkittleBrewer("trop")

cbbPalette <- c("#8720B6","#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
trop <- RSkittleBrewer::RSkittleBrewer("trop")

gg_sy <- theme(legend.position = "bottom", axis.text = element_text(size = 20),
               axis.title = element_text(size = 20), legend.text = element_text(size = 20),
               legend.title = element_text(size = 20),plot.title = element_text(size = 20) )

appender <- function(string) TeX(paste(string))

DT[percentoverlap=="percent_overlap_0", p_overlap := 0]
DT[percentoverlap=="percent_overlap_100", p_overlap := 100]
DT[, table(percentoverlap, p_overlap)]

DT[geography=="geography_ind", geo := "block"]
DT[geography=="geography_circ", geo := "circular"]
DT[geography=="geography_1d", geo := "1D"]
DT[, table(geography)]

DT[, scenario:= as.numeric(as.character(stringr::str_extract_all(parameterIndex, "\\d", simplify = T)))]
DT$scenario %>% table
DT[, scen:=ifelse(scenario==1,"Strong Hierarchy",ifelse(scenario==2, "Weak Hierarchy", ifelse(scenario==3,"Interactions Only",ifelse(scenario==4, "Strong Hierarchy (Linear)", ifelse(scenario==5, "Main Effects Only", "Linear v2")))))]
DT$scen %>% table
DT[, scen:=factor(scen, levels = c("Strong Hierarchy", "Weak Hierarchy","Interactions Only","Strong Hierarchy (Linear)","Linear v2","Main Effects Only"))]
DT$scen %>% table



sim %>%
  plot_eval(metric_name = "correct_sparsity")
sim %>%
  plot_eval(metric_name = "prederror")# + ylim(c(0, 10))
sim %>%
  plot_eval(metric_name = "me")# + ylim(c(0, 10))
# simulator::draws(sim)
# we added an extra simulation scenario here.
# sim <- simulator::load_simulation(name_of_simulation, dir = "simulation/")
#
# mref <-  generate_model(make_model = make_mixed_model_SSC, b0 = 0, sigma2 = 4,
#                         eta = 0.10,
#                         percent_causal = 1,
#                         percent_overlap = list("0","100"),
#                         vary_along = "percent_overlap")
#
# dref <- simulate_from_model(mref, nsim = 4, index = 1:50)
# sim <- simulator::add(sim, mref)
# sim <- simulator::add(sim, dref)
# oref <- simulator::run_method(dref, list(lassoPCpf, PENFAM, TWOSTEP),
#                               parallel = list(socket_names = 35,
#                                               libraries = c("glmnet","magrittr","MASS","progress","Matrix","coxme","gaston")))
# sim <- simulator::add(sim, oref)
# simulator::save_simulation(sim)
# sim
#
#

## @knitr load-results

sim <- simulator::load_simulation(name_of_simulation, dir = "simulation/")


## @knitr plots

sim %>%
  subset_simulation(methods = c("ggmix","lasso")) %>%
  plot_eval(metric_name = "mse", scales = "free") + panel_border() #+
# ggplot2::geom_hline(yintercept = 0.5)

sim %>%
  subset_simulation(methods = c("ggmix")) %>%
  plot_eval(metric_name = "sigma2") + panel_border()

# appender <- function(string) TeX(paste(string))
# df <- as.data.frame(evals(sim))
# df <- df %>% separate(Model, into = c("simnames","beta0","eta","percent_causal","percent_overlap","sigma2"),
#                       sep = "/")
#
# DT <- as.data.table(df, stringsAsFactors = FALSE)
# DT[Method=="lassoPCpf", Method := "lasso"]
# DT[Method=="penfam", Method := "ggmix"]
# DT[Method=="twostep", Method := "2 step"]
# DT[, table(Method, useNA = "al")]
# DT[, Method := droplevels(Method)]
# DT[, table(Method, useNA = "al")]
# DT[, scen := case_when(percent_overlap == "percent_overlap_0" ~ "No overlap",
#                        percent_overlap == "percent_overlap_100" ~ "100% overlap")]
# DT[,table(scen, useNA = "al")]
# pacman::p_load_gh("hrbrmstr/hrbrthemes")
# pacman::p_load(ggrepel)
# # pacman::p_load(Cairo)
# pacman::p_load(extrafont)
# extrafont::loadfonts()
#
#
# df_tpr_nactive <- DT[, c("Method","scen","tpr","nactive")] %>%
#   group_by(Method, scen) %>%
#   summarise(mean.tpr = mean(tpr, na.rm = TRUE), sd.tpr = sd(tpr, na.rm = TRUE),
#             mean.nactive = mean(nactive, na.rm = TRUE), sd.nactive = sd(nactive, na.rm = TRUE))
#
# p1_tpr_nactive <- ggplot(data = df_tpr_nactive, aes(x = mean.nactive, y = mean.tpr, color = Method, label = Method)) +
#   geom_point(size = 2.1) +
#   geom_text_repel(
#     data = subset(df_tpr_nactive, mean.nactive < 40),
#     nudge_x      = 20,
#     direction    = "y",
#     hjust        = 0,
#     segment.size = 0.2
#   ) +
#   geom_text_repel(
#     data = subset(df_tpr_nactive, mean.nactive >= 40),
#     nudge_x      = 5,
#     direction    = "y",
#     hjust        = 0,
#     segment.size = 0.2
#   ) +
#   geom_errorbar(aes(ymin = mean.tpr - sd.tpr, ymax = mean.tpr + sd.tpr), size = 1.1) +
#   geom_errorbarh(aes(xmin = mean.nactive - sd.nactive, xmax = mean.nactive + sd.nactive), size = 1.1) +
#   facet_rep_wrap(~scen, scales = "free", ncol = 2,
#                  repeat.tick.labels = 'left',
#                  labeller = as_labeller(appender,
#                                         default = label_parsed)) +
#   # scale_color_brewer(palette = "Dark2")+
#   scale_color_manual(values=RColorBrewer::brewer.pal(12, "Paired")[-11], guide=guide_legend(ncol=3)) +
#   labs(x="Number of active variables", y="True Positive Rate",
#        title="True Positive Rate vs. Number of Active Variable (Mean +/- 1 SD)",
#        subtitle="Based on 200 simulations",
#        caption="") +
#   theme_ipsum_rc(axis_title_just = "bt") +
#   theme(legend.position = "right",
#         legend.text=element_text(size=14),
#         strip.text = element_text(size=14))
#
# reposition_legend(p1_mse_nactive, 'center', panel='panel-2-3')



## @knitr tpr

tabulate_eval(sim, "tpr", output_type = "markdown",
              caption = "Mean True Positive Rate (Standardar Error) over 200 simulations",
              format_args = list(nsmall = 3, digits = 0))

## @knitr fpr

tabulate_eval(sim, "fpr", output_type = "markdown",
              format_args = list(digits = 4, nsmall=3))


## @knitr nactive

tabulate_eval(sim, "nactive", output_type = "markdown",
              format_args = list(digits = 3, nsmall=3))

## @knitr model-error

tabulate_eval(sim, "me", output_type = "markdown",
              format_args = list(digits = 3, nsmall=3))

## @knitr pred-error

tabulate_eval(sim, "me", output_type = "markdown",
              format_args = list(digits = 3, nsmall=3))

## @knitr not-used


# # sim %>% evaluate(list(muy))
# # warnings()
# #
# plot_eval(sim, metric_name = "eta") + panel_border()
# plot_eval(sim, metric_name = "sigma2") + panel_border()
# plot_eval(sim, metric_name = "rmse") + panel_border()
# # plot_eval(sim, metric_name = "r2") + panel_border()
# #
# plot_eval(sim, metric_name = "time") + panel_border()
# plot_eval(sim, metric_name = "fpr") + panel_border() #+ ylim(c(0,1))
# plot_eval(sim, metric_name = "tpr") + panel_border() #+ ylim(c(0,1))
# plot_eval(sim, metric_name = "r2") + panel_border() #+ ylim(c(0,1))
#
#
#
#
# sim %>% evaluate(list(tpr,fpr))
# df <- as.data.frame(evals(sim))
# dfn <- df
# dfn$Method <- as.character(dfn$Method)
# dfn[dfn$Method=="penfam","Method"] <- "ggmix"
#
# pdf("~/Dropbox/jobs/hec/talk/tpr_fpr.pdf")#, width = 11, height = 8)
# ggplot(data = dfn[dfn$Method!="twostep",], aes(x=fpr, y=tpr, color = Method)) + geom_point(size=4) +
#   theme(legend.position = "bottom", axis.text=element_text(size=16), axis.title=element_text(size=18,face="bold"),
#         legend.text = element_text(size=16), legend.title = element_text(size=16)) +
#   xlab("False positive rate") + ylab("True positive rate") + xlim(c(0.002,0.062)) +
#   ylim(c(0,0.25))
# dev.off()
#
# pdf("~/Dropbox/jobs/hec/talk/tpr_fpr_all_causal400.pdf")#, width = 11, height = 8)
# p1 <- ggplot(data = dfn, aes(x=fpr, y=tpr, color = Method)) + geom_point(size=4) +
#   theme(legend.position = "bottom", axis.text=element_text(size=16), axis.title=element_text(size=18,face="bold"),
#         legend.text = element_text(size=16), legend.title = element_text(size=16)) +
#   xlab("False positive rate") + ylab("True positive rate") + xlim(c(0.002,0.062)) +
#   ylim(c(0,0.25))
# dev.off()
#
#
# pacman::p_load(gridExtra)
# df <- data.frame(ggmix = "26.7 (1.06)", lasso = "32.8 (0.87)", twostep = "4784.1 (0.19)")
# rownames(df) <- "mean RMSE (sd)"
# t1 <- grid.table(df)
# t1 <- tableGrob(df)
# dev.off()
#
# pdf("~/Dropbox/jobs/hec/talk/tpr_fpr_all_causal400.pdf")#, width = 11, height = 8)
# plot_grid(p1,t1, ncol = 1, rel_heights =  c(1,0.2))
# dev.off()
#
#
# png("~/Dropbox/jobs/hec/talk/tpr_fpr_all_causal400.png", width = 11, height = 8, units = "in", res = 150)
# plot_grid(p1,t1, ncol = 1, rel_heights =  c(1,0.2))
# dev.off()
#
#
#
# theme(axis.text=element_text(size=12),
#       axis.title=element_text(size=14,face="bold"))
#
# ggsave(filename = "/home/sahir/Dropbox/PhD/Year4/IGES/poster/tpr_fpr.png")
# ggsave(filename = "/home/sahir/Dropbox/jobs/hec/talk/tpr_fpr.png")
#
#
# sim %>% evaluate(list(rmse))
# plot_eval(sim, metric_name = "rmse") + panel_border()
#
# df <- melt(as.data.frame(evals(sim)@evals))
# df
#
#
# sim %>%
#   tabulate_eval(metric_name = "rmse",
#                 format_args = list(nsmall = 3, digits = 0),
#                 output_type = "latex")
#
#
# #
# #
# # head(df)
#
# #
# #
# #
# # sim %>% evaluate(list(rmse))
# # sim %>% plot_eval(metric_name = "rmse") + panel_border()
# #
# #
# # sim %>% evaluate(list(tpr,fpr))
# # sim %>% plot_evals("fpr","tpr")
# #
# # sim %>% evaluate(list(sqrerr))
# # sim %>% plot_eval(metric_name = "sqrerr") + panel_border()
# # evals(sim)@evals
# # #
# # #
# # # plot_eval(sim, metric_name = "muy") + panel_border()
# # # plot_eval(sim, metric_name = "rmse",
# # #              type = "raw", main = "p = 1")
# # #
# # # unname()
# # # df <- as.data.frame(evals(sim))
# # # head(df)
# # # melt(as.data.frame(evals(sim)@evals))
# # #
# # # # sim_res <- sim %>%
# # # #   evaluate(list(sqrerr, nnz, best_sqrerr))
# # #
# # # ## @knitr plots
# # #
# # # plot_eval_by(sim, "hisloss", varying = "prob")
# # #
# # # ## @knitr tables
# # #
# # # tabulate_eval(sim, "herloss", output_type = "markdown",
# # #               format_args = list(digits = 1))
# #
# # # model@params$kin[1:10,1:10]
# # # model@params$x %>% dim
# # # draws@draws$r1.1
# # # draws@draws$r1.2
# # #
# dat <- make_mixed_model_not_simulator(b0 = 1, eta = 0.5, sigma2 = 4, percent_causal = 1, percent_overlap = "100")
dat <- make_ADmixed_model_not_simulator(n = 200, p = 5000, ncausal = 20, k = 3, s = 0.5, Fst = 0.1, b0 = 0, beta_mean = 1,
                                        eta = 0.5, sigma2 = 4)
dat <- make_INDmixed_model_not_simulator(n = 200, p = 5000, ncausal = 10, k = 3, s = 0.5, Fst = 0.1, b0 = 0,
                                         beta_mean = 1, eta = 0.5, sigma2 = 4)
# dat$kin[1:25,1:25]
# pheno_dat <- data.frame(Y = dat$y, id = rownames(dat$kin))
pheno_dat <- data.frame(Y = dat$y, id = paste0("ID",1:length(dat$y)))
# all(names(dat$y)==rownames(dat$kin))
# hist(dat$y)
# head(pheno_dat)
# fitting with no intercept, i.e., 0 + (1|id) returns error
# fit_lme <- coxme::lmekin(Y ~ 1 + (1|id), data = pheno_dat, varlist = dat$kin)
# newy <- residuals(fit_lme)
#
# all(rownames(dat$x)==rownames(dat$kin))
x1 <- cbind(rep(1, nrow(dat$x)))
fit <- gaston::lmm.aireml(dat$y, x1, K = dat$kin)
# plot(dat$y - (fit$BLUP_omega + fit$BLUP_beta),
#      newy)
# all.equal(dat$y - (fit$BLUP_omega + fit$BLUP_beta),
#      newy)
# abline(a=0,b=1)
gaston_resid <- dat$y - (fit$BLUP_omega + fit$BLUP_beta)
hist(gaston_resid)
fitglmnet <- glmnet::cv.glmnet(x = dat$x, y = gaston_resid, standardize = T, alpha = 1, intercept = T)
plot(fitglmnet)
# yhat <- predict(fitglmnet, newx = dat$x_lasso, s = "lambda.min")
# as.numeric(sqrt(crossprod(gaston_resid - yhat)))
(nonz2step <- setdiff(rownames(coef(fitglmnet, s = "lambda.min")[nonzeroCoef(coef(fitglmnet, s = "lambda.min")),,drop=F]),c("(Intercept)")))
(tpr2step <- length(intersect(nonz2step, dat$causal))/length(dat$causal))
dim(dat$x)
dim(dat$x_lasso)
fitglmnet2 <- glmnet::cv.glmnet(x = dat$x_lasso, y = dat$y, standardize = T, alpha = 1, intercept = T,
                                penalty.factor = c(rep(1, 5000), rep(0, 10)))
plot(fitglmnet2)
# yhat2 = predict(fitglmnet2, newx = dat$x_lasso, s = "lambda.min")
# as.numeric(sqrt(crossprod(dat$y - yhat2)))
(nonzlasso <- setdiff(rownames(coef(fitglmnet2, s = "lambda.min")[nonzeroCoef(coef(fitglmnet2, s = "lambda.min")),,drop=F]),c("(Intercept)","")))
(tprlasso <- length(intersect(nonzlasso, dat$causal))/length(dat$causal))

l2norm <- function(x) sqrt(sum(x^2))

l2norm(dat$x %*% dat$beta - dat$x %*% coef(fitglmnet2, s = "lambda.min")[2:(ncol(dat$x)+1),,drop=F])


coef(fitglmnet2, s = "lambda.min")[1,,drop=F]
### two step
length(nonz2step) ; tpr2step

### lasso
length(nonzlasso) ; tprlasso




all(names(dat$y)==rownames(dat$x))
all(rownames(dat$x)==rownames(dat$kin))
fit <- penfam(x = dat$x,
              y = dat$y,
              phi = dat$kin,
              # thresh_glmnet = 1e-10,
              # epsilon = 1e-5,
              # fdev = 1e-7,
              # alpha = 1,
              # tol.kkt = 1e-3,
              # nlambda = 100,
              # an = log(log(length(dat$y))) * log(length(dat$y)),
              # an = log(log(length(dat$y))),
              # an = log(length(dat$y)),
              # lambda_min_ratio  = ifelse(model$n < model$p, 0.01, 0.001),
              # lambda_min_ratio  = 0.01,
              eta_init = 0.5,
              maxit = 100)


# try re-arranigning termsof kinship to match x matrix, try hgp real data
plot(fit, "BIC")
coef(fit)[match(dat$causal, rownames(coef(fit))),]
nonzero = predict(fit, type = "nonzero", s = fit$lambda_min)
nonzero_names = setdiff(rownames(predict(fit, type = "nonzero", s = fit$lambda_min)), c("(Intercept)","eta","sigma2"))
length(intersect(nonzero_names, dat$causal))/length(dat$causal)
length(nonzero_names)

l2norm(dat$x %*% dat$beta - dat$x %*% coef(fit, s = fit$lambda_min)[2:(ncol(dat$x)+1),,drop=F])
#
#
#
# # dat <- make_mixed_model_not_simulator(2,.6, 1, percent_causal = 2.5, percent_overlap = "0")
# dat$kin %>% colnames
# dat$kin %>% rownames
# dat$file_paths
# kins <- snpStats::read.plink(dat$file_paths$X_Phi)
# kins
#
# x <- gaston::read.bed.matrix(dat$file_paths$X_Phi)
# slotNames(x)
# x@snps$chr %>% table
# x@sigma
#
# plot(x@p, x@sigma, xlim=c(0,1))
# t <- seq(0,1,length=101);
# lines(t, sqrt(2*t*(1-t)), col="red")
# plot(2*x@p, x@mu)
# abline(a=0,b=1, col = "red")
#
# as.matrix(x)[1:5,1:5]
# standardize(x) <- "p"
# x@standardize_mu_sigma
# as.matrix(x)[1:5,1:5]
# X <- as.matrix(x)
# # any(is.na(X[,2,drop=F]))
#
# kin <- gaston::GRM(x, autosome.only = FALSE)
# kin[1:5,1:5]
# all(complete.cases(kin))
# eiK <- eigen(kin)
#
# # deal with a small negative eigen value
# eiK$values[ eiK$values < 0 ] <- 0
# any(eiK$values < 0)
# PC <- sweep(eiK$vectors, 2, sqrt(eiK$values), "*")
# dim(PC)
# plot(PC[,1], PC[,2])
# PC[,1:10]
#
# rownames(kin)
# head(pheno_dat)
# hist(dat$y)
# dat$kin[1:5,1:5]
#
# # fit_lme <- coxme::lmekin(Y ~ 1 + (1|id), data = pheno_dat, varlist = xxmat)
#
# pacman::p_load(snpStats)
# xxmat <- snpStats::xxt(kins$genotypes, correct.for.missing = TRUE)
# evv <- eigen(xxmat, symmetric=TRUE)
# pcs <- evv$vectors[,1:5]
# evals <- evv$values[1:5]
# plot(evv$values)
# str(xxmat)
# xxmat[1:5,1:5]/4000
# kinPD <- as(nearPD(xxmat)$mat,"matrix")
# dimnames(kinPD)[[1]] <- dat$file_paths$Phi_names
# dimnames(kinPD)[[2]] <- kin_names$V1
# kin <- kinPD
#
# dat$kin[1:5,1:5]
# all(eigen(dat$kin)$values > 0)
# all(evv$values > 0)
# # pheno_dat <- data.frame(Y = dat$y, id = rownames(dat$kin))
#
# pheno_dat <- data.frame(Y = dat$y, id = rownames(xxmat))
# head(pheno_dat)
#
# # fitting with no intercept, i.e., 0 + (1|id) returns error
# fit_lme <- coxme::lmekin(Y ~ 1 + (1|id), data = pheno_dat, varlist = dat$kin)
# fit_lme <- coxme::lmekin(Y ~ 1 + (1|id), data = pheno_dat, varlist = xxmat)
# pheatmap::pheatmap(dat$kin)
# dat$kin[1:5,1:5]
# fit_lme %>% names
# newy <- residuals(fit_lme)
# plot( newy, dat$y)
# cor(newy,dat$y)
# abline(a=0,b=1)
# fit_lme %>% str
# coef(fit_lme)
# newy <- residuals(fit_lme)
# hist(newy)
# fitglmnet <- glmnet::cv.glmnet(x = dat$x_lasso, y = newy, standardize = F, alpha = 1, intercept = F,
#                                penalty.factor = c(rep(1, 4000), rep(0, 10)))
# plot(fitglmnet)
# yhat = predict(fitglmnet, newx = dat$x_lasso, s = "lambda.min")
# as.numeric(sqrt(crossprod(newy - yhat)))
# nonz <- setdiff(rownames(coef(fitglmnet, s = "lambda.min")[nonzeroCoef(coef(fitglmnet, s = "lambda.min")),,drop=F]),c("(Intercept)"))
# length(intersect(nonz, dat$causal))/length(dat$causal)
# coef(fitglmnet, s = "lambda.min")[1:3,]
#
# as.numeric(sqrt(crossprod(dat$y - (yhat+coef(fit_lme)$fixed+coef(fit_lme)$random$id))))
# as.numeric(sqrt(crossprod(dat$y - (yhat+coef(fit_lme)$fixed))))
#
# fitglmnet2 <- glmnet::cv.glmnet(x = dat$x_lasso, y = dat$y, standardize = T, alpha = 1, intercept = T,
#                                 penalty.factor = c(rep(1, 4000), rep(0, 10)))
# plot(fitglmnet2)
# yhat2 = predict(fitglmnet2, newx = dat$x_lasso, s = "lambda.min")
# as.numeric(sqrt(crossprod(dat$y - yhat2)))
# nonz2 <- setdiff(rownames(coef(fitglmnet2, s = "lambda.min")[nonzeroCoef(coef(fitglmnet2, s = "lambda.min")),,drop=F]),c("(Intercept)"))
# length(intersect(nonz2, dat$causal))/length(dat$causal)
#
#
# fitglmnetwithint <- glmnet::cv.glmnet(x = dat$x, y = newy, standardize = T, alpha = 1)
# plot(fitglmnetwithint)
#
# fit <- penfam(x = dat$x,
#               y = dat$y,
#               phi = dat$kin,
#               thresh_glmnet = 1e-10,
#               epsilon = 1e-5,
#               fdev = 1e-4,
#               alpha = 1,
#               tol.kkt = 1e-3,
#               nlambda = 100,
#               # an = log(log(model$n)) * log(model$n),
#               an = log(log(length(dat$y))),
#               # lambda_min_ratio  = ifelse(model$n < model$p, 0.01, 0.001),
#               lambda_min_ratio  = 0.05,
#               eta_init = 0.5,
#               maxit = 100)
#
# nonzero = predict(fit, type = "nonzero", s = fit$lambda_min)
# nonzero_names = setdiff(rownames(predict(fit, type = "nonzero", s = fit$lambda_min)), c("(Intercept)","eta","sigma2"))
# plot(fit)
#
# require(snpStats)
# data(for.exercise)
# controls <- rownames(subject.support)[subject.support$cc==0]
# use <- seq(1, ncol(snps.10), 10)
# ctl.10 <- snps.10[controls,use]
#
#
# ###################################################
# ### code chunk number 2: xxt-matrix
# ###################################################
# xxmat <- xxt(ctl.10, correct.for.missing=FALSE)
# xxmat[1:5,1:5]
#
# ###################################################
# ### code chunk number 3: eigen
# ###################################################
# evv <- eigen(xxmat, symmetric=TRUE)
# pcs <- evv$vectors[,1:5]
# evals <- evv$values[1:5]
# evals
#
#
#
# # dat$causal
# # dat$x %>% dim
# # dat$y %>% hist
# # dat$kin %>% dim
# # sum(dat$y>0)
# # plot(dat$y)
# # range(dat$y)
# # plot(density(dat$y))
# # # sum(draws@draws$r1.1>0)
# # # sum(draws@draws$r1.2>0)
# # # plot(dat$y, draws@draws$r1.1)
# # #
# # system.time(
# # fit <- penfam(x = dat$x,
# #               y = dat$y,
# #               phi = dat$kin,
# #               thresh_glmnet = 1e-10,
# #               epsilon = 1e-5,
# #               fdev = 1e-7,
# #               alpha = 1,
# #               tol.kkt = 1e-3,
# #               nlambda = 100,
# #               # an = log(log(1000)) * log(1000),
# #               an = log(log(1000)),
# #               # an = log(1000),
# #               # lambda_min_ratio  = ifelse(model$n < model$p, 0.01, 0.001),
# #               lambda_min_ratio  = 0.05,
# #               eta_init = 0.5,
# #               maxit = 100)
# # )
# # #
# # plot(fit, type = "BIC")
# # sum(rownames(coef(fit, s = fit$lambda_min)[nonzeroCoef(coef(fit, s = fit$lambda_min)),,drop=F]) %in% dat$causal)
# # rownames(coef(fit, s = fit$lambda_min)[nonzeroCoef(coef(fit, s = fit$lambda_min)),,drop=F]) %>% length()
# # sqrt(crossprod(fit$predicted[,fit$lambda_min] - dat$y))
# # #
# # #
# # #
# # #
# # fitglmnet <- cv.glmnet(x = dat$x, y = dat$y, alpha = 1, standardize = F)
# # plot(fitglmnet)
# # sum(rownames(coef(fitglmnet)[nonzeroCoef(coef(fitglmnet)),,drop=F] ) %in% dat$causal)
# # coef(fitglmnet)[nonzeroCoef(coef(fitglmnet)),,drop=F]
# # coef(fitglmnet)[nonzeroCoef(coef(fitglmnet)),,drop=F] %>% dim
# # sqrt(crossprod(predict(fitglmnet, newx = dat$x) - dat$y))
sahirbhatnagar/penfam documentation built on April 14, 2021, 9:38 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
sahirbhatnagar/penfam
Variable Selection in Linear Mixed Models for SNP Data

simulation/main.R
In sahirbhatnagar/penfam: Variable Selection in Linear Mixed Models for SNP Data

R Package Documentation

Browse R Packages

We want your feedback!

sahirbhatnagar/penfam Variable Selection in Linear Mixed Models for SNP Data

simulation/main.R In sahirbhatnagar/penfam: Variable Selection in Linear Mixed Models for SNP Data

R Package Documentation

Browse R Packages

We want your feedback!

sahirbhatnagar/penfam
Variable Selection in Linear Mixed Models for SNP Data

simulation/main.R
In sahirbhatnagar/penfam: Variable Selection in Linear Mixed Models for SNP Data