Guide.R
In MTPS: Multi-Task Prediction using Stacking Algorithms

## ----library, echo = T, results = 'hide', warning=FALSE, message=FALSE--------
library(MTPS)

## ----data---------------------------------------------------------------------
data("HIV")
head(YY)
XX[8:9, 7:10]
dim(YY)
dim(XX)

## ----ctn-split-data, echo = T, results = 'hide'-------------------------------
set.seed(12345)
xmat <- as.matrix(XX)
ymat <- as.matrix(YY)
nobs <- nrow(xmat)
id <- createFolds(rowMeans(XX), k=5, list=F)
training.id <- sample(seq_len(nobs), size = 0.8 * nobs)
y.train <- ymat[training.id, ]
y.test  <- ymat[-training.id, ]
x.train <- xmat[training.id, ]
x.test  <- xmat[-training.id, ]

## ----ctn-noss, echo = T, results = 'hide'-------------------------------------
# no stacking
fit.mult <- multiFit(xmat = x.train, ymat = y.train, method = glmnet.ridge, family = "gaussian")

## ----ctn-train, echo = T, results = 'hide', eval=FALSE------------------------
#  # Standard Stacking
#  fit.ss <- MTPS(xmat = x.train, ymat = y.train, family = "gaussian",
#                              cv = FALSE, residual = FALSE,
#                              method.step1 = glmnet.ridge,
#                              method.step2 = rpart1)
#  # Cross-Validation Stacking
#  fit.cv <- MTPS(xmat = x.train, ymat = y.train, family = "gaussian",
#                              cv = TRUE, residual = FALSE,
#                              method.step1 = glmnet.ridge,
#                              method.step2 = rpart1)
#  # Residual Stacking
#  fit.rs <- MTPS(xmat = x.train, ymat = y.train, family = "gaussian",
#                              cv = FALSE, residual = TRUE,
#                              method.step1 = glmnet.ridge,
#                              method.step2 = rpart1)
#  # Cross-Validation Residual Stacking
#  fit.cvrs <- MTPS(xmat = x.train, ymat = y.train, family = "gaussian",
#                              cv = TRUE, residual = TRUE,
#                              method.step1 = glmnet.ridge,
#                              method.step2 = rpart1)

## ----echo=F,eval=T------------------------------------------------------------
data("Internal")

## ----ctn-predict, echo = T, results = 'hide', eval=FALSE----------------------
#  # no stacking
#  pred.mult <- predict(fit.mult, x.test)
#  # Standard Stacking
#  pred.ss <- predict(fit.ss, x.test)
#  # Cross-Validation Stacking
#  pred.cv <- predict(fit.cv, x.test)
#  # Residual Stacking
#  pred.rs <- predict(fit.rs, x.test)
#  # Cross-Validation Residual Stacking
#  pred.cvrs <- predict(fit.cvrs, x.test)

## ----ctn-outcome, eval=F------------------------------------------------------
#  library(ggplot2)
#  library(reshape2)
#  n.test <- nrow(x.test)
#  ctn.plot.data_matrix <- cbind(rbind(pred.mult, pred.ss, pred.cv, pred.rs, pred.cvrs), y.test[rep(seq_len(n.test), 5), ])
#  ctn.plot.data <-data.frame(rep(c("No-Stacking", "SS", "CV", "RS", "CVRS"), each = n.test),ctn.plot.data_matrix)
#  colnames(ctn.plot.data) <- c("method", paste0(rep("pred.", ncol(y.test)), colnames(y.test)), colnames(y.test))
#  dm1 <- melt(ctn.plot.data[,c("method","ABC","3TC","AZT","D4T", "DDI")], id=c("method"))
#  dm2 <- melt(ctn.plot.data[,c("method","pred.ABC","pred.3TC","pred.AZT","pred.D4T", "pred.DDI")], id=c("method"))
#  ctn.plot.data <- cbind(dm1, dm2[, -1])
#  colnames(ctn.plot.data) <- c("method", "Y", "yVal", "predict", "predictVal")
#  ctn.plot.data$method <- factor(ctn.plot.data$method, unique(as.character(ctn.plot.data$method)))
#  ctn.plot.data$yVal <- as.numeric(ctn.plot.data$yVal)
#  ctn.plot.data$predictVal <- as.numeric(ctn.plot.data$predictVal)
#  ggplot(ctn.plot.data) +
#    geom_point(aes(x=predictVal, y=yVal, color=method), size = 0.5, alpha = 0.8) +
#    geom_abline(slope=1, alpha = 0.2) +
#    coord_fixed() +
#    ylab("Testing Data Outcome") + xlab("Predicted Outcome on Testing Data") +
#    scale_x_discrete(breaks = NULL) +
#    scale_y_discrete(breaks = NULL) +
#    theme_bw() +
#    theme(axis.text = element_blank(),
#          strip.placement = "outside",
#          strip.background = element_blank()) +
#    facet_grid(Y~method)

## ----warning=FALSE, echo=FALSE------------------------------------------------
library(ggplot2)
library(reshape2)
ggplot(ctn.plot.data) +
  geom_point(aes(x=predictVal, y=yVal, color=method), size = 0.5, alpha = 0.8) +
  geom_abline(slope=1, alpha = 0.2) +
  coord_fixed() +
  ylab("Testing Data Outcome") + xlab("Predicted Outcome on Testing Data") +
  scale_x_discrete(breaks = NULL) +
  scale_y_discrete(breaks = NULL) +
  theme_bw() +
  theme(axis.text = element_blank(),
        strip.placement = "outside",
        strip.background = element_blank()) +
  facet_grid(Y~method)

## ----bin-data, echo = T, results = 'hide', eval=FALSE-------------------------
#  # https://hivdb.stanford.edu/pages/published_analysis/genophenoPNAS2006/CUTOFFS/drug.cutoffs
#  # cutoff value to be used to define drug resistent
#  cutoffs <- c(2,3,3,1.5,1.5)
#  ymat.bin <- ymat
#  xmat.bin <- xmat
#  for(ii in 1:5) ymat.bin[,ii] <- (10^ymat[,ii] < cutoffs[ii]) * 1
#  y.train.bin <- ymat.bin[training.id, ]
#  y.test.bin  <- ymat.bin[-training.id, ]
#  x.train.bin <- xmat.bin[training.id, ]
#  x.test.bin  <- xmat.bin[-training.id, ]

## ----bin-train, echo = T, results = 'hide', eval=FALSE------------------------
#  fit.prs.std <- MTPS(xmat = x.train.bin, ymat=y.train.bin,
#                                 family = "binomial",
#                                 cv = FALSE, residual = TRUE,
#                                 method.step1 = rpart1,
#                                 method.step2 = lm1,
#                                 resid.type = "pearson", resid.std = TRUE)
#  pred.prs.std <- predict(fit.prs.std, x.test.bin)

## ----bin-outcome, echo = T----------------------------------------------------
for (yy in 1 : ncol(y.test.bin)) {
  print(colnames(y.test.bin)[yy])
  print(table((pred.prs.std[,yy] > 0.5) * 1, y.test.bin[,yy]))
}


## ----mix-data, echo = T, results = 'hide', eval=FALSE-------------------------
#  ymat.mix <- cbind(ymat[,1:3], ymat.bin[,4:5])
#  xmat.mix <- xmat
#  y.train.mix <- ymat.mix[training.id, ]
#  y.test.mix  <- ymat.mix[-training.id, ]
#  x.train.mix <- xmat.mix[training.id, ]
#  x.test.mix  <- xmat.mix[-training.id, ]

## ----mix-training, echo = T, results = 'hide', warning=FALSE, message=FALSE, eval=FALSE----
#  fit.mix.rs <- MTPS(xmat = x.train.mix, ymat = y.train.mix,
#                  family=c("gaussian","gaussian","gaussian","binomial","binomial"),
#                  cv = FALSE, residual = TRUE,
#                  method.step1 = glmnet.lasso,
#                  method.step2 = rpart1)
#  pred.mix.rs <- predict(fit.mix.rs, x.test.mix)

## ----mix-outcome, eval=FALSE--------------------------------------------------
#  n.test <- nrow(x.test)
#  mix.plot.data <- cbind(rep(colnames(y.test.mix)[1:3], each=nrow(y.test.mix)),
#                         rbind(cbind(pred.mix.rs[, 1], y.test.mix[, 1]),
#                               cbind(pred.mix.rs[, 2], y.test.mix[, 2]),
#                               cbind(pred.mix.rs[, 3], y.test.mix[, 3])))
#  colnames(mix.plot.data) <- c("Y", "predict", "outcome")
#  mix.plot.data <- as.data.frame(mix.plot.data)
#  mix.plot.data$predict <- as.numeric(as.character(mix.plot.data$predict))
#  mix.plot.data$outcome <- as.numeric(as.character(mix.plot.data$outcome))
#  ggplot(mix.plot.data) +
#    geom_point(aes(x=predict, y=outcome, color=Y), size = 0.5, alpha = 0.8) +
#    ylab("Outcome of Testing Data") + xlab("Predicted Outcome of Testing Data") +
#    scale_x_discrete(breaks = NULL) +
#    scale_y_discrete(breaks = NULL) +
#    geom_abline(slope=1, alpha = 0.2) +
#    coord_fixed() +
#    theme_bw() +
#    theme(legend.title = element_blank(),
#          axis.text = element_blank(),
#          strip.placement = "outside",
#          strip.background = element_blank()) +
#    facet_grid(~Y)

## ----echo=FALSE---------------------------------------------------------------
ggplot(mix.plot.data) +
  geom_point(aes(x=predict, y=outcome, color=Y), size = 0.5, alpha = 0.8) +
  ylab("Outcome of Testing Data") + xlab("Predicted Outcome of Testing Data") +
  scale_x_discrete(breaks = NULL) +
  scale_y_discrete(breaks = NULL) +
  geom_abline(slope=1, alpha = 0.2) +
  coord_fixed() +
  theme_bw() +
  theme(legend.title = element_blank(),
        axis.text = element_blank(),
        strip.placement = "outside",
        strip.background = element_blank()) +
  facet_grid(~Y)

## ----echo=T, eval=T-----------------------------------------------------------
for (yy in 4 : 5) {
  print(colnames(y.test.mix)[yy])
  print(table((pred.mix.rs[,yy] > 0.5) * 1, y.test.mix[,yy]))
}

## ----mix-mtd, echo = T, warning=FALSE,message=FALSE, eval=FALSE---------------
#  fit.mixOut <- MTPS(xmat=x.train, ymat=y.train, family="gaussian",
#                  method.step1 =
#                    c(glmnet.lasso,glmnet.lasso,glmnet.lasso,lm1,lm1),
#                  method.step2 =
#                    c(rpart1,glmnet.lasso,glmnet.lasso,glmnet.lasso,glmnet.lasso))
#  pred <- predict(fit.mixOut, x.test)

## ----method-mod, eval=F, echo=T,eval=FALSE------------------------------------
#  glmnet.lasso <- modify.parameter (glmnet1, alpha=1)
#  glmnet.ridge <- modify.parameter (glmnet1, alpha=0)

## ----method-new, eval=F, echo=T,eval=FALSE------------------------------------
#  glm1 <- function(xmat, ymat, family, ...) {
#    tmp0 <- data.frame(yy=ymat, xmat)
#    model <- glm(yy~., data=tmp0, family=family, ...)
#    y.fitted <- fitted(model)
#    predFun <- function(model,xnew){
#      predict(model, newdata=data.frame(xnew), type="response")
#    }
#    return(list(model=model,y.fitted=y.fitted, predFun=predFun))
#  }

## ----learner-compare, echo=T,eval=F-------------------------------------------
#  nsim <- 20
#  mse.lasso.lm <- matrix(NA, nrow = nsim, ncol = ncol(ymat))
#  mse.lasso.lm <- as.data.frame(mse.lasso.lm)
#  colnames(mse.lasso.lm) <-colnames(ymat)
#  mse.ridge.lm <- mse.lasso.lm
#  for (ii in 1:nsim) {
#    set.seed(ii)
#    # lasso stacking with lm
#    mse.lasso.lm[ii,] <- cv.MTPS(xmat, ymat, family="gaussian",
#                              cv = FALSE, residual = TRUE,
#                              method.step1=glmnet.lasso, method.step2=lm1,
#                              resid.std=FALSE)$continuous
#    # ridge stacking with lm
#    mse.ridge.lm[ii,] <- cv.MTPS(xmat, ymat, family="gaussian",
#                              cv = FALSE, residual = TRUE,
#                              method.step1=glmnet.ridge, method.step2=lm1,
#                              resid.std=FALSE)$continuous
#  }
#  # box plot
#  mse.data <- data.frame(lasso=apply(mse.lasso.lm,1,mean),
#                         ridge=apply(mse.ridge.lm,1,mean))
#  mse.data <- melt(mse.data,measure.vars = c("lasso","ridge"))
#  
#  colnames(mse.data) <- c("Learner", "MSE")
#  ggplot(mse.data) +
#    geom_boxplot(aes(x=Learner, y=MSE, fill=Learner)) +
#    ggtitle("Boxplot of average Mean Square Error") +
#    theme_bw() +
#    theme(legend.position = "none",
#          plot.title = element_text(hjust = 0.5),
#          axis.title.x = element_blank())

## ----echo=F,eval=T------------------------------------------------------------
# box plot
ggplot(mse.data) +
  geom_boxplot(aes(x=Learner, y=MSE, fill=Learner)) +
  ggtitle("Boxplot of average Mean Square Error") +
  theme_bw() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        axis.title.x = element_blank())