R/grridgeCVlin.R

.grridgeCVlin <- function (grr, highdimdata, response, outerfold = length(response), 
          fixedfolds = TRUE, recalibrate = FALSE) {
  #grl <- .grridgelin(highdimdata=simdata, response=Y, partitions=part5, unpenal = ~1, innfold=10,selectionEN=TRUE)
  #grr <- grl; outerfold=3;highdimdata=simdata; response=Y;fixedfolds=TRUE;recalibrate=FALSE
  model <- grr$model
  arg <- grr$arguments
  if (model == "survival") 
    allobstimes <- sort(response[, 1])
  balance <- FALSE
  nsam <- ncol(highdimdata)
  
  if(arg$standardizeX){
    print("Covariates are standardized")
    sds <- apply(highdimdata,1,sd)
    sds2 <- sapply(sds,function(x) max(x,10^{-5}))
    highdimdata <- (highdimdata-apply(highdimdata,1,mean))/sds2 
  }
  
  if (is.null(colnames(highdimdata))) {
    print("No sample names available. Creating names S1, ..., Sn.")
    cnames <- sapply(1:nsam, function(i) paste("S", i, sep = ""))
    colnames(highdimdata) <- cnames
  }
  if (outerfold < 10 & model != "linear") {
    print("Stratified splits for events applied")
    balance <- TRUE
  }
  if (fixedfolds) 
    set.seed(3534) else set.seed(NULL)
  if (!balance) {
    rand <- sample(1:nsam)
    grs1 <- floor(nsam/outerfold)
    grs2 <- grs1 + 1
    ngr1 <- outerfold * grs2 - nsam
    folds <- lapply(1:outerfold, function(xg) {
      if (xg <= ngr1) 
        els <- rand[(1 + (xg - 1) * grs1):(xg * grs1)]
      else els <- rand[(ngr1 * grs1 + 1 + (xg - ngr1 - 
                                             1) * grs2):(ngr1 * grs1 + (xg - ngr1) * grs2)]
      return(els)
    })
  } else {
    if (model == "logistic") 
      if (class(response) == "factor") 
        nev <- which((as.numeric(response) - 1) == 1)
      else nev <- which(response == 1)
      if (model == "survival") 
        nev <- which(response[, 1] == 1)
      nsamev <- length(nev)
      randev <- sample(nev)
      grs1 <- floor(nsamev/outerfold)
      grs2 <- grs1 + 1
      ngr1 <- outerfold * grs2 - nsamev
      foldsev <- lapply(1:outerfold, function(xg) {
        if (xg <= ngr1) 
          els <- randev[(1 + (xg - 1) * grs1):(xg * grs1)]
        else els <- randev[(ngr1 * grs1 + 1 + (xg - ngr1 - 
                                                 1) * grs2):(ngr1 * grs1 + (xg - ngr1) * grs2)]
        return(els)
      })
      nonev <- setdiff(1:nsam, nev)
      nsamnonev <- length(nonev)
      randnonev <- sample(nonev)
      grs1 <- floor(nsamnonev/outerfold)
      grs2 <- grs1 + 1
      ngr1 <- outerfold * grs2 - nsamnonev
      foldsnonev <- lapply(1:outerfold, function(xg) {
        if (xg <= ngr1) 
          els <- randnonev[(1 + (xg - 1) * grs1):(xg * 
                                                    grs1)]
        else els <- randnonev[(ngr1 * grs1 + 1 + (xg - ngr1 - 
                                                    1) * grs2):(ngr1 * grs1 + (xg - ngr1) * grs2)]
        return(els)
      })
      folds <- lapply(1:outerfold, function(i) c(foldsev[[i]], 
                                                 foldsnonev[[i]]))
  }
  if (!(is.null(grr$arguments$dataunpen)) & recalibrate == 
      TRUE) {
    recalibrate <- FALSE
    print("Recalibration currently only feasible for designs without unpenalized covariates. Recalibrate set to FALSE")
  }
  if (model == "survival" & recalibrate == TRUE) {
    recalibrate <- FALSE
    print("Recalibration currently only feasible for linear and binary response. Recalibrate set to FALSE")
  }
  ntest <- length(folds[[1]])
  if (ntest < 25 & recalibrate == TRUE) {
    recalibrate <- FALSE
    print("Test sample size too small for recalibration. Need at least 25 test samples. Recalibrate set to FALSE")
  }
  whichfold <- rep(NA, nsam)
  linpreds <- c()
  preds <- c()
  responsesam <- c()
  predsbres <- c()
  for (k in 1:length(folds)) {
    #k<-1
    print(paste("Fold nr:", k))
    samout <- folds[[k]]
    whichfold[samout] <- k
    nout <- length(samout)
    nin <- length(response)-nout
    highdimdatamin <- highdimdata[, -samout]
    responsemin <- response[-samout]
    dataunpenmin <- arg$dataunpen[-samout, , drop = FALSE]
    offsarg <- arg$offset
    unpenal = arg$unpenal
    
    dataunpenout <- arg$dataunpen[samout, , drop = FALSE]
    if((is.null(dataunpenout)) | (unpenal == ~0) | (unpenal == ~1)) {
      mmout <- NULL
    } else  {
      mmout <- model.matrix(unpenal,dataunpenout)
      if(prod(mmout[,1]==rep(1,nout))==1) {
        mmout <- mmout[,-1,drop=FALSE]
      }
    }

    if (length(offsarg) > 1) offsmin <- offsarg[-samout] else offsmin <- rep(0,nin)
    grmin <- .grridgelin(highdimdatamin, responsemin, partitions = arg$partitions, 
                     unpenal = arg$unpenal, offset = offsmin, method = arg$method, 
                     niter = arg$niter, monotone = arg$monotone, optl = arg$optl, 
                     innfold = arg$innfold, fixedfoldsinn = arg$fixedfoldsinn, 
                     maxsel = arg$maxsel, cvlmarg = arg$cvlmarg, dataunpen = dataunpenmin, 
                     savepredobj = arg$savepredobj, ord = arg$ord, comparelasso = arg$comparelasso, 
                     optllasso = arg$optllasso, selectionEN = arg$selectionEN, 
                     compareunpenal = arg$compareunpenal, modus = arg$modus, EBlambda=arg$EBlambda, standardizeX = FALSE)
    penobj <- grmin$predobj
    Xsam <- t(highdimdata[, samout, drop = FALSE])
    optl<-grmin$arguments$optl
    optllasso <- grmin$arguments$optllasso
    if (length(offsarg) > 1) offsout <- offsarg[samout] else offsout <- rep(0,nout)
    
    responsesam <- c(responsesam, samout)
    responseout <- response[samout]
    predellall <- c()
    predellall2 <- c()
    linpredall <- c()
    if (is.null(penobj)) {
      cat("No prediction objection available. Run grridge using either savepredobj=\"last\" or savepredobj=\"all\"\n")
      return(NULL)
    }

    nmp <- names(penobj)
    if (model == "logistic" | model == "linear") {
      npreds <- length(penobj)
      if (arg$comparelasso) 
        npreds <- npreds - 1
      if (arg$compareunpenal) 
        npreds <- npreds - 1
      if (arg$selectionEN) npredsEN <- length(arg$maxsel) else npredsEN <- 0
        npreds <- npreds - npredsEN
      if (npreds > 0) {
        for (ell in 1:npreds) {
          #ell<-1
          predobj <- penobj[[ell]]
          lmvecall <- grmin$lambdamultvec[, ell]
          Xsamw <- t(t(Xsam)/sqrt(lmvecall))
          
          predell <- try(as.numeric(predict(predobj,cbind(Xsamw,mmout),s=c(optl),offset=offsout,type="response")),silent=T)
          if(class(predell) == "try-error") predell <- as.numeric(predict(predobj,cbind(Xsamw,mmout),s=c(optl),newoffset=offsout,type="response"))
          
          predellall <- cbind(predellall, predell)
          if (recalibrate) {
            datlp <- Xsamw %*% predobj$beta
            if (model == "logistic") 
              refitmod <- glm(responseout ~ 1 + datlp, 
                              family = "binomial")
            else refitmod <- glm(responseout ~ 1 + datlp, 
                                 family = "gaussian")
            print(paste("Recalibration intercept and slope for (gr)-ridge model", 
                        ell, ":", round(refitmod$coefficients[1], 
                                        3), ";", round(refitmod$coefficients[2], 
                                                       3)))
            predell2 <- predict(refitmod, type = "response")
            predellall2 <- cbind(predellall2, predell2)
          }
        }
      }
      if (arg$comparelasso) {
        take <- npreds + 1
        predobj <- penobj[[take]]
        
        predell <- try(as.numeric(predict(predobj,cbind(Xsam,mmout),s=c(optllasso),offset=offsout, type="response")), silent=T)
        if(class(predell) == "try-error") predell <- as.numeric(predict(predobj,cbind(Xsam,mmout),s=c(optllasso),newoffset=offsout,type="response"))
        
        predellall <- cbind(predellall, predell)
        if (recalibrate) {
          datlp <- Xsam %*% predobj$beta
          if (model == "logistic") 
            refitmod <- glm(responseout ~ 1 + datlp, 
                            family = "binomial")
          else refitmod <- glm(responseout ~ 1 + datlp, 
                               family = "gaussian")
          print(paste("Recalibration intercept and slope for lasso model:", 
                      round(refitmod$coefficients[1], 3), ";", 
                      round(refitmod$coefficients[2], 3)))
          predell2 <- predict(refitmod, type = "response")
          predellall2 <- cbind(predellall2, predell2)
        }
      }
      if (arg$selectionEN) {
        nadd <- arg$comparelasso
        toadd <- npreds + nadd
        for (ell in 1:npredsEN) {
        predobj <- penobj[[toadd+ell]]
        nc <- ncol(grmin$lambdamultvec)
        lmvecall <- grmin$lambdamultvec[, nc]
        Xsamw <- t(t(Xsam)/sqrt(lmvecall))
        whEN <- grmin$resEN[[ell]]$whichEN
        Xsamw <- Xsamw[, whEN, drop = FALSE]
        
        predell <- try(as.numeric(predict(predobj,cbind(Xsamw,mmout),s=c(optl),offset=offsout,type="response")),silent=T)
        if(class(predell) == "try-error") predell <- as.numeric(predict(predobj,cbind(Xsamw,mmout),s=c(optl),newoffset=offsout,type="response"))
        
        
        predellall <- cbind(predellall, predell)
        if (recalibrate) {
          datlp <- Xsamw %*% predobj@penalized
          if (model == "logistic") 
            refitmod <- glm(responseout ~ 1 + datlp, 
                            family = "binomial")
          else refitmod <- glm(responseout ~ 1 + datlp, 
                               family = "gaussian")
          print(paste("Recalibration intercept and slope for EN model", 
                      ell, ":", round(refitmod$coefficients[1], 
                                      3), ";", round(refitmod$coefficients[2], 
                                                     3)))
          predell2 <- predict(refitmod, type = "response")
          predellall2 <- cbind(predellall2, predell2)
        }
        } #end for ell
      }
      #TODO
      if (arg$compareunpenal) {
        nadd <- arg$comparelasso +  npredsEN
        take <- npreds + nadd + 1
        predobj <- penobj[[take]]
        predell <- predict(predobj, newdata = data.frame(mmout), #bug repaired 19/12/2016: data -> newdata
                           type = "response")[1:nout]
        predellall <- cbind(predellall, predell)
      }
      print(paste("Sample(s) left out:", samout))
      print("True:")
      print(response[samout])
      print("Prediction(s):")
      colnames(predellall) <- nmp
      if (recalibrate) {
        colnames(predellall2) <- paste(nmp, "_recalib", 
                                       sep = "")
        predellall <- cbind(predellall, predellall2)
      }
      print(data.frame(response = response[samout], predellall))
      preds <- rbind(preds, predellall)
    }
  } #end for loop
    
  if (model == "logistic" | model == "linear") {
    sams <- unlist(folds)
    od <- order(sams)
    mat <- data.frame(response[responsesam], preds)[od, ]
    mat <- data.frame(mat, whichfold)
    colnames(mat) <- c("TrueResponse", colnames(preds), "whichfold")
    return(mat)
  }
}
markvdwiel/GRridge documentation built on May 21, 2019, 12:25 p.m.