dataAnalysisReview/test.R

# A test file to start.

library(readr)
library(tibble)
library(dplyr)
library(pROC)

dat_full <- read_csv("Breast_GSE70947.csv")
n <- nrow(dat_full)

ind_norm <- which(dat_full$type =="normal")
ind_cancer <- (1:n)[-ind_norm]
cancer_yes <- numeric(n)
cancer_yes[ind_cancer] <- 1

# Let's start with a PCA of the variables.
#
# pca_full <- prcomp(dat_full[,-(1:2)], rank. = 100)
# summary(pca_full)
#
# new_cov <- pca_full$x

# Generate a small data with the first 10 principal components.
### NOT TESTING WITH THIS, PCA components too predictive.
# mydat <- tibble("y" = as.factor(cancer_yes)) %>%
#   bind_cols(as_tibble(new_cov[,1:10]))

# Generate a small data picking the first 10 genes.
mydat <- tibble("y" = as.factor(cancer_yes)) %>%
  bind_cols(dat_full[,3:102])

# plot(new_cov[ind_norm,1], new_cov[ind_norm,2], col = "blue",
#      xlim = range(new_cov[,1]), ylim = range(new_cov[,2]))
# points(new_cov[ind_cancer,1], new_cov[ind_cancer,2], col = "red")

# Let's create a simple training, testing and validation set.
set.seed(1)
sam_set <- sample(1:3, size = n, replace=TRUE, prob = c(0.60,0.2,0.2))
dat_train <- mydat[sam_set==1,]
dat_test <- mydat[sam_set==2,]
dat_val <- mydat[sam_set==3,]

# Fit a simple GLM
mod_glm <- glm(y~., data = dat_train,
               family = binomial())
test_phat <- predict(mod_glm, type = "response", newdata = dat_test)
val_phat <- predict(mod_glm, type = "response", newdata = dat_val)

roc_train <- roc(response = dat_train$y, predictor = mod_glm$fitted.values)
roc_test <- roc(response = dat_test$y, predictor = test_phat)
roc_val <- roc(response = dat_val$y, predictor = val_phat)

cat("Train AUC: ", round(roc_train$auc*100,1), "%\n",sep = "")
cat("Test AUC: ", round(roc_test$auc*100,1), "%\n",sep = "")
cat("Val AUC: ", round(roc_val$auc*100,1), "%\n",sep = "")

###########################################################
###########################################################
###########################################################

# A helper function for running analysis from models.
run.test <- function(obj, dat_train, dat_test, dat_val,
                     pred.fun = predict.spamLL) {
  train_phat <- pred.fun(obj, as.matrix(dat_train[,-1]))
  test_phat <- pred.fun(obj, as.matrix(dat_test[,-1]))
  val_phat <- pred.fun(obj, as.matrix(dat_val[,-1]))

  auc_train <- auc_test <- auc_val <- numeric(ncol(train_phat))
  for(i in 1:ncol(train_phat) ){
    #print(i)
    roc_train <- roc(response = dat_train$y, predictor = train_phat[,i], quiet = TRUE)
    roc_test <- roc(response = dat_test$y, predictor = test_phat[,i], quiet = TRUE)
    roc_val <- roc(response = dat_val$y, predictor = val_phat[,i], quiet = TRUE)
    auc_train[i] <- roc_train$auc
    auc_test[i] <- roc_test$auc
    auc_val[i] <- roc_val$auc
  }

  ind <- which.max(auc_test)
  cat("Train AUC: ", round(auc_train[ind]*100,1), "%\n",sep = "")
  cat("Test AUC: ", round(auc_test[ind]*100,1), "%\n",sep = "")
  cat("Val AUC: ", round(auc_val[ind]*100,1), "%\n",sep = "")
  return(c("train" = auc_train[ind], "test" = auc_test[ind],
           "val" = auc_val[ind]))
}

###########################################################
###########################################################
###########################################################
# Lasso now.

library(glmnet)
mod_lasso <- glmnet(x = as.matrix(dat_train[,-1]),y =  dat_train$y,
                    family = "binomial",
                    nlambda = 50, lambda.min.ratio = 1e-5)
pred.lasso <- function(obj, dat) {
  predict(obj, newx = dat, type = "response")
}
lasso_res <- run.test(obj = mod_lasso,dat_train =  dat_train,dat_test =  dat_test,
         dat_val = dat_val, pred.fun = pred.lasso)


###########################################################
###########################################################
###########################################################
# Now SPAM, sparse additive modeling
library(SAM)
#?samLL

mod_spam2 <- samLL(X = dat_train[,-1],
                   y = as.numeric(as.character(dat_train$y)), p=2,
                   nlambda = 50, lambda.min.ratio = 1e-2)
mod_spam4 <- samLL(X = dat_train[,-1],
                   y = as.numeric(as.character(dat_train$y)), p=4,
                   nlambda = 50, lambda.min.ratio = 1e-2)
mod_spam6 <- samLL(X = dat_train[,-1],
                   y = as.numeric(as.character(dat_train$y)), p=6,
                   nlambda = 50, lambda.min.ratio = 1e-2)
spam2_res <- run.test(mod_spam2, dat_train, dat_test, dat_val,
                      pred.fun = predict.spamLL)
spam4_res <- run.test(mod_spam4, dat_train, dat_test, dat_val,
                      pred.fun = predict.spamLL)
spam6_res <- run.test(mod_spam6, dat_train, dat_test, dat_val,
                      pred.fun = predict.spamLL)

######################################################################
######################################################################
######################################################################

# Now for GSAM with Sobolev penalty

library(GSAM)
mod_ssp <- fit.additive(y=dat_train$y , x= dat_train[, -1],
                         family="binomial", method = "sobolev",
                         lambda.max = 3,
                         lambda.min.ratio = 1e-3, max.iter = 100)

gsam_pred <- function(obj, dat) {
  GSAM:::predict.add_mod(obj, new.data = dat,
                         type  = "response")
}

ssp_res <- run.test(mod_ssp, dat_train, dat_test, dat_val, pred.fun = gsam_pred)


######################################################################
######################################################################
######################################################################

# Finally, we have the Trend filtering penalties.


mod_tf0 <- fit.additive(y=dat_train$y , x= dat_train[, -1],
                        family="binomial", method = "tf", k = 0,
                        lambda.max = 3,
                        lambda.min.ratio = 1e-3, max.iter = 100)

mod_tf1 <- fit.additive(y=dat_train$y , x= dat_train[, -1],
                        family="binomial", method = "tf", k = 1,
                        lambda.max = 3,
                        lambda.min.ratio = 1e-3, max.iter = 100)
mod_tf2 <- fit.additive(y=dat_train$y , x= dat_train[, -1],
                        family="binomial", method = "tf", k = 2,
                        lambda.max = 3,
                        lambda.min.ratio = 1e-3, max.iter = 100)
tf0_res <- run.test(mod_tf0, dat_train, dat_test, dat_val,
                    pred.fun = gsam_pred)
tf1_res <- run.test(mod_tf1, dat_train, dat_test, dat_val,
                    pred.fun = gsam_pred)
tf2_res <- run.test(mod_tf2, dat_train, dat_test, dat_val,
                    pred.fun = gsam_pred)



########################################################
########################################################
########################################################

# Testing parallelization of method
max.lambda <- 1
lambda.min.ratio <- 1e-3

mod_serial <- fit.additive(y=dat_train$y, x=dat_train[,-1],
                           family="binomial", method = "sobolev",
                           lambda.max = max.lambda,
                           lambda.min.ratio = lambda.min.ratio,
                           max.iter = 100, parallel = FALSE)

mod_para <- fit.additive(y=dat_train$y, x=dat_train[,-1],
                        family="binomial", method = "sobolev",
                        lambda.max = max.lambda,
                        lambda.min.ratio = lambda.min.ratio,
                        max.iter = 100, parallel = TRUE)


########################################################
########################################################
########################################################
# Testing Other datasets.

# We will look at spambase dataset package kernlab

data(spam, package = "kernlab")

set.seed(1)
dat <- process.dat2("spam")


n <- length(dat$y)

sam_set <- sample(1:3, size = n, replace=TRUE,
                  prob = c(0.60,0.2,0.2))
dat_train <- dat[sam_set==1,]
dat_test <- dat[sam_set==2,]
dat_val <- dat[sam_set==3,]

lasso <- simulation.lasso(dat_train, dat_test, dat_val,
                          lambda.min.ratio = 1e-4)


spam2 <- simulation.spam(dat_train, dat_test, dat_val, nbasis = 2,
                         lambda.min.ratio = 5e-4)


#######################################################

# Testing error in SPAM

dat <- process.dat("Breast_GSE70947")
set.seed(1)
#dat <- process.dat2("spam")


n <- length(dat$y)

sam_set <- sample(1:3, size = n, replace=TRUE,
                  prob = c(0.60,0.2,0.2))
dat_train <- dat[sam_set==1,]
dat_test <- dat[sam_set==2,]
dat_val <- dat[sam_set==3,]


X <- as.matrix(dat_train[,-c(1,14110)])
y <- as.numeric(as.character(dat_train$y))
p <- 10


fit = list()
fit$p = p
fit = list()
X = as.matrix(X)
y = as.vector(y)
n = nrow(X)
d = ncol(X)
m = d * p
n1 = sum(y == 1)
n0 = sum(y == 0)
if ((n1 + n0) != n) {
  cat("Please check the labels. (Must be coded in 1 and 0)")
  fit = "Please check the labels."
  return(fit)
}
fit$p = p
X.min = apply(X, 2, min)
X.max = apply(X, 2, max)
X.ran = X.max - X.min
X.min.rep = matrix(rep(X.min, n), nrow = n, byrow = T)
X.ran.rep = matrix(rep(X.ran, n), nrow = n, byrow = T)
X = (X - X.min.rep)/X.ran.rep
fit$X.min = X.min
fit$X.ran = X.ran
Z = matrix(0, n, m)
fit$nkots = matrix(0, p - 1, d)
fit$Boundary.knots = matrix(0, 2, d)
for (j in 1:d) {
  print(j)
  tmp = (j - 1) * p + c(1:p)
  tmp0 = ns(X[, j], df = p)
  Z[, tmp] = tmp0
  fit$nkots[, j] = attr(tmp0, "knots")
  fit$Boundary.knots[, j] = attr(tmp0, "Boundary.knots")
}

ns(problem, df = 3)
asadharis/PGSAME documentation built on Feb. 18, 2021, 9:14 p.m.