R/rbsurv.rule.boot.R
In rbsurv: Robust likelihood-based survival modeling with microarray data

'rbsurv.rule.boot' <- function(x.train, y.train, st.train, x.test=NULL, y.test=NULL, st.test=NULL,
                                            n.iter=10, nfold=3, method="efron") {


     ##########
     #Preparation
     n.gene <- ncol(x.train)
     n.sample.train <- nrow(x.train)
     colnames(x.train) <- 1:n.gene

      if(is.null(y.test)==FALSE)
      {
         n.sample.test  <- nrow(x.test)
         colnames(x.test)  <- 1:n.gene
      }

     ##################
     #Initiate
     opt.genes <- c()
     opt.nloglik    <- c()
     opt.AIC  <- c()
     opt.BIC  <- c()

     opt.nloglik.train    <- c()
     opt.AIC.train  <- c()
     opt.BIC.train  <- c()

     ##################
     #Pick 0-gene model
     xx.train <- NA
     xx.test  <- NA

     #Evaluate by train set
     tie <- TRUE; if(length(unique(y.train)) == length(y.train)) tie <- FALSE
     tmp <- option.rbsurv.null(y.train, st.train, y.train, st.train, tie=tie, method=method)
     null.nloglik.train <- tmp$nloglik
     null.AIC.train <- tmp$AIC
     null.BIC.train <- tmp$BIC

     #Evaluate by test set
      if(is.null(y.test)==FALSE)
      {
          tie <- TRUE; if(length(unique(y.test)) == length(y.test)) tie <- FALSE
          tmp <- option.rbsurv.null(y.train, st.train, y.test, st.test, tie=tie, method=method)
          null.nloglik <- tmp$nloglik
          null.AIC  <- tmp$AIC
          null.BIC  <- tmp$BIC
     }


     ##################
     #Pick 1-gene model
     out <- matrix(0, n.iter, n.gene)
     for(i in 1:n.iter) {
        id <- id.sample(st.train, nfold=nfold)
        y.tr <- y.train[id!=1]
        y.te <- y.train[id==1]
        st.tr <- st.train[id!=1]
        st.te <- st.train[id==1]
        tie <- TRUE; if(length(unique(y.te)) == length(y.te)) tie <- FALSE
        for(j in 1:n.gene) {
            x.tr <- data.frame(x.train[id!=1,j, drop=FALSE])
            x.te <- data.frame(x.train[id==1,j, drop=FALSE])
            out[i,j] <- option.rbsurv(x.tr, y.tr, st.tr, x.te, y.te, st.tr,
                                tie=tie, method=method, k=2)$nloglik
        }
     }


     ii <- which(apply(apply(out, 2, is.na), 2, sum) >0)
     if(length(ii) > 0) out[,ii] <- NA
     out.sum <- apply(out, 2, sum, na.rm = TRUE)
     pick.gene <- which.min(out.sum)
     pick.gene <- as.numeric(colnames(x.train)[pick.gene])
     opt.genes <- c(opt.genes, pick.gene)

     x.train.cand <- x.train[,-opt.genes,drop=FALSE]
     x.train.opt  <- data.frame(x.train[,opt.genes,drop=FALSE])
     colnames(x.train.opt) <- opt.genes

     #Evaluate by train set
     xx.train <- data.frame(x.train[,opt.genes,drop=FALSE])
     tie <- TRUE; if(length(unique(y.train)) == length(y.train)) tie <- FALSE
     tmp <- option.rbsurv(xx.train, y.train, st.train, xx.train, y.train, st.train,
                    tie=tie, method=method, k=2)
     opt.nloglik.train <- c(opt.nloglik.train, tmp$nloglik)
     opt.AIC.train <- c(opt.AIC.train, tmp$AIC)
     opt.BIC.train <- c(opt.BIC.train, tmp$BIC)

     #Evaluate by test set
     if(is.null(y.test)==FALSE)
     {
          xx.test  <- data.frame(x.test[,opt.genes,drop=FALSE])
          tie <- TRUE; if(length(unique(y.test)) == length(y.test)) tie <- FALSE
          tmp <- option.rbsurv(xx.train, y.train, st.train, xx.test, y.test, st.test,
                                         tie=tie, method=method, k=2)
          opt.nloglik <- c(opt.nloglik, tmp$nloglik)
          opt.AIC  <- c(opt.AIC, tmp$AIC)
          opt.BIC  <- c(opt.BIC, tmp$BIC)
     }


     ####################
     #Pick k-gene model (k >1)
     i.stop <- 0
     for(jj in 2:(n.gene-1)) {
        n.gene.cand <- n.gene-jj+1
        out <- matrix(0, n.iter, n.gene.cand)
        for(i in 1:n.iter) {
            id <- id.sample(st.train, nfold=nfold)
            y.tr <- y.train[id!=1]
            y.te <- y.train[id==1]
            st.tr <- st.train[id!=1]
            st.te <- st.train[id==1]
            tie <- TRUE; if(length(unique(y.te)) == length(y.te)) tie <- FALSE
            tmp <- -1########
            for(j in 1:n.gene.cand) {
                x.tr <- data.frame(x.train.opt[id!=1,,drop=FALSE], x.train.cand[id!=1,j,drop=FALSE])
                x.te <- data.frame(x.train.opt[id==1,,drop=FALSE], x.train.cand[id==1,j,drop=FALSE])
                out[i,j] <- option.rbsurv(x.tr, y.tr, st.tr, x.te, y.te, st.tr,
                                    tie=tie, method=method, k=2)$nloglik
                #print(j)
                #print(out[i,j])
            }
        }

        ii <- which(apply(apply(out, 2, is.na), 2, sum) >0)
        if(length(ii) >= ncol(out)) break
        if(length(ii) > 0) out[,ii] <- max(out, na.rm=TRUE)+1
        out.sum <- apply(out, 2, sum, na.rm = TRUE)
        if(length(unique(out.sum)) == 1) break #WHY same???

        pick.gene <- which.min(out.sum) #FIXED
        if(length(pick.gene) != 1) next #FIXED
        pick.gene <- as.numeric(colnames(x.train.cand)[pick.gene])
        opt.genes <- c(opt.genes, pick.gene)
        x.train.opt  <- x.train[, opt.genes,drop=FALSE]
        x.train.cand <- x.train[,-opt.genes,drop=FALSE]

        #Evaluate by train set
        xx.train <- x.train[,opt.genes,drop=FALSE]
        tie <- TRUE; if(length(unique(y.train)) == length(y.train)) tie <- FALSE
        tmp <- option.rbsurv(xx.train, y.train, st.train, xx.train, y.train, st.train,
                       tie=tie, method=method, k=2)
        opt.nloglik.train <- c(opt.nloglik.train, tmp$nloglik)
        opt.AIC.train  <- c(opt.AIC.train, tmp$AIC)
        opt.BIC.train  <- c(opt.BIC.train, tmp$BIC)

        #Evaluate by test set
        if(is.null(y.test)==FALSE)
       {
            xx.test  <- x.test[,opt.genes,drop=FALSE]
            tie <- TRUE; if(length(unique(y.test)) == length(y.test)) tie <- FALSE
            tmp <- option.rbsurv(xx.train, y.train, st.train, xx.test,  y.test, st.test,
                                             tie=tie, method=method, k=2)
            opt.nloglik <- c(opt.nloglik, tmp$nloglik)
            opt.AIC  <- c(opt.AIC, tmp$AIC)
            opt.BIC  <- c(opt.BIC, tmp$BIC)
        }

        #stop if min number of classes is less than 2
        if(sum(st.train)*(1-1/nfold) <= length(opt.genes)+1) break

    }

    #######
    #Outputs
    if(is.null(y.test)==TRUE)
    {
        opt.nloglik  <- opt.nloglik.train
        opt.AIC       <- opt.AIC.train
        null.nloglik <-  null.nloglik.train
        null.AIC      <- null.AIC.train
    }

    i <- 1:length(opt.genes)
    final.out <- data.frame(i, opt.genes, opt.nloglik.train, opt.AIC.train, opt.nloglik, opt.AIC)
    colnames(final.out) <- c("Order","Gene","Train.nloglik","Train.AIC", "nloglik","AIC")

    i <- 0
    final.out.null <- data.frame(i, i, null.nloglik.train, null.AIC.train,  null.nloglik, null.AIC)
    colnames(final.out.null) <- c("Order","Gene","Train.nloglik","Train.AIC", "nloglik","AIC")
    final.out <- rbind(final.out.null, final.out)

    return(final.out)

}