my_sims/mcgill_simulations_gendata_cvfit.R

# for i in 1 ; do qsub -v index=$i mcgill-simulation.sh ; done

pacman::p_load(splines)
pacman::p_load(magrittr)
pacman::p_load(foreach)
pacman::p_load(methods)
pacman::p_load(doMC)
pacman::p_load(profvis)
library(sail)
# pacman::p_load(gamsel)

# rm(list=ls())
# dev.off()
# devtools::load_all("/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/sail/sail_lambda_branch/")
# devtools::load_all("/home/sahir/git_repositories/sail/")
source("/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/sail/sail_git_v2/sail/my_sims/model_functions.R")

message("loaded packages")
parameterIndex <- as.numeric(as.character(commandArgs(trailingOnly = T)[1]))
message(parameterIndex)
# parameterIndex = 1

if (parameterIndex == 1) { # 1a
  hierarchy = "strong" ; nonlinear = TRUE ; interactions = TRUE
} else if (parameterIndex == 2) { # 1b
  hierarchy = "weak" ; nonlinear = TRUE ; interactions = TRUE
} else if (parameterIndex == 3) { # 1c
  hierarchy = "none" ; nonlinear = TRUE ; interactions = TRUE
} else if (parameterIndex == 4) { # 2
  hierarchy = "strong"; nonlinear = FALSE; interactions = TRUE
} else if (parameterIndex == 5) { # 3
  hierarchy = "strong" ; nonlinear = TRUE ; interactions = FALSE
} else if (parameterIndex == 6) { # this is scenario 2 but we fit sail with degree=1
  hierarchy = "strong"; nonlinear = FALSE; interactions = TRUE
}

lambda.type <- "lambda.min"
# hierarchy = "strong", nonlinear = TRUE, interactions = TRUE, # scenario 1a
# hierarchy = "weak", nonlinear = TRUE, interactions = TRUE, # scenario 1b
# hierarchy = "none", nonlinear = TRUE, interactions = TRUE, # scenario 1c
# hierarchy = "strong", nonlinear = FALSE, interactions = TRUE, # scenario 2
# hierarchy = "strong", nonlinear = TRUE, interactions = FALSE, # scenario 3

# Simulate Data -----------------------------------------------------------

n = 400
p = 1000

# DT <- gendataPaper(n = n, p = p, SNR = 2, betaE = 1,
#                    hierarchy = hierarchy, nonlinear = nonlinear, interactions = interactions,
#                    corr = 0,
#                    E = truncnorm::rtruncnorm(n, a = -1, b = 1))

draw <- make_gendata_Paper_data_split_not_simulator(n = n, p = p, corr = 0,
                                                    betaE = 2, SNR = 2, lambda.type = "lambda.min", parameterIndex = parameterIndex)

# message("simulated data")
# fit <- sail(x = draw[["xtrain"]], y = draw[["ytrain"]], e = draw[["etrain"]],
#             basis = function(i) splines::bs(i, degree = 5))
# message("ran sail")
# ytest_hat <- predict(fit, newx = draw[["xtest"]], newe = draw[["etest"]])
# msetest <- colMeans((draw[["ytest"]] - ytest_hat)^2)
# lambda.min.index <- as.numeric(which.min(msetest))
# lambda.min <- fit$lambda[which.min(msetest)]
# # plot(log(fit$lambda), msetest)
# yvalid_hat <- predict(fit, newx = draw[["xvalid"]], newe = draw[["evalid"]], s = lambda.min)
# msevalid <- mean((draw[["yvalid"]] - drop(yvalid_hat))^2)
#
# nzcoef <- predict(fit, s = lambda.min, type = "nonzero")


# res <- list(beta = coef(fit, s = lambda.min)[-1,,drop=F],
#             fit = fit,
#             lambda.min = lambda.min,
#             lambda.min.index = lambda.min.index,
#             vnames = draw[["vnames"]],
#             nonzero_coef = nzcoef,
#             active = fit$active[[lambda.min.index]],
#             not_active = setdiff(draw[["vnames"]], fit$active[[lambda.min.index]]),
#             yvalid_hat = yvalid_hat,
#             msevalid = msevalid,
#             causal = draw[["causal"]],
#             not_causal = draw[["not_causal"]],
#             yvalid = draw[["yvalid"]])

registerDoMC(cores = 10)

if (parameterIndex != 6) {
cvfit <- cv.sail(x = draw[["xtrain"]], y = draw[["ytrain"]], e = draw[["etrain"]],
                 basis = function(i) splines::bs(i, degree = 5),
                 parallel = TRUE,
                 nfolds = 10)
} else if (parameterIndex == 6){
  cvfit <- cv.sail(x = DT$x, y = DT$y, e = DT$e, degree = 1, basis.intercept = FALSE,
                   thresh = 1e-4,
                   maxit = 1000,
                   alpha = .05,
                   parallel = TRUE,
                   # foldid = foldid,
                   nfolds = 10, verbose = T, nlambda = 100)
  DT$scenario <- "6"
}

cvfit$sail.fit$x <- draw[["xtrain"]]


# plot(cvfit)
# plot(cvfit2)
# plot(cvfit$sail.fit)
# cvfit$sail.fit
# coef(cvfit, s = "lambda.min")[nonzero(coef(cvfit, s = "lambda.min")),,drop=F]
# coef(cvfit, s = "lambda.1se")[nonzero(coef(cvfit, s = "lambda.1se")),,drop=F]
# coef(cvfit2, s = "lambda.min")[nonzero(coef(cvfit2, s = "lambda.min")),,drop=F]
# coef(cvfit2, s = "lambda.1se")[nonzero(coef(cvfit2, s = "lambda.1se")),,drop=F]
# cvfit <- cvfit2
# par(mfrow=c(2,2))
# for (i in 1:4){
#   xv <- paste0("X",i)
#   ind <- cvfit$sail.fit$group == which(cvfit$sail.fit$vnames == xv)
#   design.mat <- cvfit$sail.fit$design[,cvfit$sail.fit$main.effect.names[ind],drop = FALSE]
#   # f.truth <- design.mat %*% DT$b1
#   f.truth <- DT[[paste0("f",i)]]
#   plotMain(object = cvfit$sail.fit, xvar = xv, s = cvfit$lambda.min, f.truth = f.truth, legend.position = "topleft")
# }

if (parameterIndex != 6) {
  saveRDS(object = cvfit,
          file = tempfile(pattern = sprintf("cvfit_thesis_n200_p1000_SNR2_betaE2_df5_%s",parameterIndex),
                          tmpdir = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/sail/sail_lambda_branch/mcgillsims/thesis_p1000_1a",
                          fileext = ".rds")
  )
} else if (parameterIndex == 6) {
  saveRDS(object = cvfit,
          file = tempfile(pattern = sprintf("cvfit_gendata2_n200_p1000_SNR2_betaE1_df1_degree1_alpha05_%s_",DT$scenario),
                          tmpdir = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/sail/sail_lambda_branch/mcgillsims/gendata2_p1000_1c_2_3_6",
                          fileext = ".rds")
  )
}

message("saved data")

# if (parameterIndex != 6) {
#   saveRDS(object = cvfit,
#           file = tempfile(pattern = sprintf("cvfit_gendata2_n200_p1000_SNR2_betaE1_df5_degree3_alpha05_%s_",DT$scenario),
#                           tmpdir = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/sail/sail_lambda_branch/mcgillsims/gendata2_p1000_1c_2_3_6",
#                           fileext = ".rds")
#   )
# } else if (parameterIndex == 6) {
#   saveRDS(object = cvfit,
#           file = tempfile(pattern = sprintf("cvfit_gendata2_n200_p1000_SNR2_betaE1_df1_degree1_alpha05_%s_",DT$scenario),
#                           tmpdir = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/sail/sail_lambda_branch/mcgillsims/gendata2_p1000_1c_2_3_6",
#                           fileext = ".rds")
#   )
# }
# files = list.files(path = '/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/sail/sail_lambda_branch/mcgillsims',
#                    pattern = '*.rds', full.names = TRUE)
# dat_list = lapply(files, function (x) readRDS(x))
# plot(dat_list[[2]])
sahirbhatnagar/funshim documentation built on July 18, 2021, 3:59 p.m.