R/TDboost.R

Defines functions summary.TDboost permutation.test.TDboost TDboost.loss relative.influence TDboost.perf TDboost TDboost.fit TDboost.more plot.TDboost predict.TDboost .onAttach

Documented in permutation.test.TDboost plot.TDboost predict.TDboost relative.influence summary.TDboost TDboost TDboost.fit TDboost.loss TDboost.more TDboost.perf

# The code is a modified version of gbm library originally written by Greg Ridgeway. See
# 
# Ridgeway, G. (2007). Generalized boosted models: A guide to the gbm package. R pack-
# age vignette. http://cran.r-project.org/web/packages/gbm.

.onAttach <- function(libname, pkgname)
     packageStartupMessage("Loaded TDboost ",
                           utils::packageVersion(pkgname, libname),"\n")


predict.TDboost <- function(object,newdata,n.trees,
                        single.tree = FALSE, type = c("response", "link"),
                        ...)
{
   type <- match.arg(type)
   if(!is.null(object$Terms))
   {
      x <- model.frame(terms(reformulate(object$var.names)),
                       newdata,
                       na.action=na.pass)
   }
   else
   {
      x <- newdata
   }

   cRows <- nrow(x)
   cCols <- ncol(x)

   for(i in 1:cCols)
   {
      if(is.factor(x[,i]))
      {
         j <- match(levels(x[,i]), object$var.levels[[i]])
         if(any(is.na(j)))
         {
            stop(paste("New levels for variable ",
                        object$var.names[i],": ",
                        levels(x[,i])[is.na(j)],sep=""))
         }
         x[,i] <- as.numeric(x[,i])-1
      }
   }

   x <- as.vector(unlist(x))
   if(missing(n.trees) || any(n.trees > object$n.trees))
   {
      n.trees[n.trees>object$n.trees] <- object$n.trees
      warning("Number of trees not specified or exceeded number fit so far. Using ",paste(n.trees,collapse=" "),".")
   }
   i.ntree.order <- order(n.trees)
      
   predF <- .Call("TDboost_pred",
                  X=as.double(x),
                  cRows=as.integer(cRows),
                  cCols=as.integer(cCols),
                  n.trees=as.integer(n.trees[i.ntree.order]),
                  initF=object$initF,
                  trees=object$trees,
                  c.split=object$c.split,
                  var.type=as.integer(object$var.type),
                  single.tree = as.integer(single.tree),
                  PACKAGE = "TDboost")
   
   if(length(n.trees)>1) 
   {
      predF <- matrix(predF,ncol=length(n.trees),byrow=FALSE)
      colnames(predF) <- n.trees
      predF[,i.ntree.order] <- predF
   }
   
   if(!is.null(attr(object$Terms,"offset")))
   {
      warning("predict.TDboost does not add the offset to the predicted values.")
   }

   return(switch(type, link = predF, response = exp(predF)))
}


plot.TDboost <- function(x,
                     i.var=1,
                     n.trees=x$n.trees,
                     continuous.resolution=100,
                     return.grid = FALSE,
                     ...)
{
   if(all(is.character(i.var)))
   {
      i <- match(i.var,x$var.names)
      if(any(is.na(i)))
      {
         stop("Plot variables not used in TDboost model fit: ",i.var[is.na(i)])
      } else
      {
         i.var <- i
      }
   }

   if((min(i.var)<1) || (max(i.var)>length(x$var.names)))
   {
      warning("i.var must be between 1 and ",length(x$var.names))
   }
   if(n.trees > x$n.trees)
   {
      warning(paste("n.trees exceeds the number of trees in the model, ",x$n.trees,
                    ". Plotting using ",x$n.trees," trees.",sep=""))
      n.trees <- x$n.trees
   }

   if(length(i.var) > 3)
   {
     warning("plot.TDboost creates up to 3-way interaction plots.\nplot.TDboost will only return the plotting data structure.")
     return.grid = TRUE
   }

   # generate grid to evaluate TDboost model
   grid.levels <- vector("list",length(i.var))
   for(i in 1:length(i.var))
   {
      # continuous
      if(is.numeric(x$var.levels[[i.var[i]]]))
      {
         grid.levels[[i]] <- seq(min(x$var.levels[[i.var[i]]]),
                                 max(x$var.levels[[i.var[i]]]),
                                 length=continuous.resolution)
      }
      # categorical or ordered
      else
      {
         grid.levels[[i]] <- as.numeric(factor(x$var.levels[[i.var[i]]],
                                               levels=x$var.levels[[i.var[i]]]))-1
      }
   }
   X <- expand.grid(grid.levels)
   names(X) <- paste("X",1:length(i.var),sep="")

   # evaluate at each data point
   X$y <- .Call("TDboost_plot",
                X=as.double(data.matrix(X)),
                cRows=as.integer(nrow(X)),
                cCols=as.integer(ncol(X)),
                i.var=as.integer(i.var-1),
                n.trees=as.integer(n.trees),
                initF=as.double(x$initF),
                trees=x$trees,
                c.splits=x$c.splits,
                var.type=as.integer(x$var.type),
                PACKAGE = "TDboost")

   # transform categorical variables back to factors
   f.factor <- rep(FALSE,length(i.var))
   for(i in 1:length(i.var))
   {
      if(!is.numeric(x$var.levels[[i.var[i]]]))
      {
         X[,i] <- factor(x$var.levels[[i.var[i]]][X[,i]+1],
                         levels=x$var.levels[[i.var[i]]])
         f.factor[i] <- TRUE
      }
   }

   if(return.grid)
   {
      names(X)[1:length(i.var)] <- x$var.names[i.var]
      return(X)
   }


   # create the plots
   if(length(i.var)==1)
   {
      if(!f.factor)
      {
         j <- order(X$X1)
         plot(X$X1,X$y,
              type="l",
              xlab=x$var.names[i.var],
              ylab=paste("f(",x$var.names[i.var],")",sep=""),...)
      }
      else
      {
         plot(X$X1,
              X$y,
              xlab=x$var.names[i.var],
              ylab=paste("f(",x$var.names[i.var],")",sep=""),
              ...)
      }
   }
   else if(length(i.var)==2)
   {
      if(!f.factor[1] && !f.factor[2])
      {
         levelplot(y~X1*X2,data=X,
                     xlab=x$var.names[i.var[1]],
                     ylab=x$var.names[i.var[2]],...)
      }
      else if(f.factor[1] && !f.factor[2])
      {
         xyplot(y~X2|X1,data=X,
                xlab=x$var.names[i.var[2]],
                ylab=paste("f(",x$var.names[i.var[1]],",",x$var.names[i.var[2]],")",sep=""),
                type="l",
                ...)
      }
      else if(!f.factor[1] && f.factor[2])
      {
         xyplot(y~X1|X2,data=X,
                xlab=x$var.names[i.var[1]],
                ylab=paste("f(",x$var.names[i.var[1]],",",x$var.names[i.var[2]],")",sep=""),
                type="l",
                ...)
      }
      else
      {
         stripplot(y~X1|X2,data=X,
                   xlab=x$var.names[i.var[2]],
                   ylab=paste("f(",x$var.names[i.var[1]],",",x$var.names[i.var[2]],")",sep=""),
                   ...)
      }
   }
   else if(length(i.var)==3)
   {
      i <- order(f.factor)
      X.new <- X[,i]
      X.new$y <- X$y
      names(X.new) <- names(X)

      # 0 factor, 3 continuous
      if(sum(f.factor)==0)
      {
         X.new$X3 <- equal.count(X.new$X3)
         levelplot(y~X1*X2|X3,data=X.new,
                     xlab=x$var.names[i.var[i[1]]],
                     ylab=x$var.names[i.var[i[2]]],...)
      }
      # 1 factor, 2 continuous
      else if(sum(f.factor)==1)
      {
         levelplot(y~X1*X2|X3,data=X.new,
                     xlab=x$var.names[i.var[i[1]]],
                     ylab=x$var.names[i.var[i[2]]],...)
      }
      # 2 factors, 1 continuous
      else if(sum(f.factor)==2)
      {
         xyplot(y~X1|X2*X3,data=X.new,
                type="l",
                xlab=x$var.names[i.var[i[1]]],
                ylab=paste("f(",paste(x$var.names[i.var[1:3]],collapse=","),")",sep=""),
                ...)
      }
      # 3 factors, 0 continuous
      else if(sum(f.factor)==3)
      {
         stripplot(y~X1|X2*X3,data=X.new,
                   xlab=x$var.names[i.var[i[1]]],
                   ylab=paste("f(",paste(x$var.names[i.var[1:3]],collapse=","),")",sep=""),
                   ...)
      }
   }
}


TDboost.more <- function(object,
                     n.new.trees = 100,
                     data = NULL,
                     weights = NULL,
                     offset = NULL,
                     verbose = NULL)
{
   if(is.null(object$Terms) && is.null(object$data))
   {
      stop("The TDboost model was fit using TDboost.fit (rather than TDboost) and keep.data was set to FALSE. TDboost.more cannot locate the dataset.")
   }
   else if(is.null(object$data) && is.null(data))
   {
      stop("keep.data was set to FALSE on original TDboost call and argument 'data' is NULL")
   }
   else if(is.null(object$data))
   {
      m <- eval(object$m, parent.frame())
      Terms <- attr(m, "terms")
      a <- attributes(Terms)

      y <- as.vector(model.extract(m, "response"))
      offset <- model.extract(m,"offset")
      x <- model.frame(delete.response(Terms),
                       data,
                       na.action=na.pass)

      w <- weights
      if(length(w)==0) w <- rep(1, nrow(x))
      w <- w*length(w)/sum(w) # normalize to N

      if(is.null(offset) || (offset==0))
      {
         offset <- NA
      }
      Misc <- NA

      # create index upfront... subtract one for 0 based order
      x.order <- apply(x[1:object$nTrain,,drop=FALSE],2,order,na.last=FALSE)-1
      x <- data.matrix(x)
      cRows <- nrow(x)
      cCols <- ncol(x)
   }
   else
   {
      y           <- object$data$y
      x           <- object$data$x
      x.order     <- object$data$x.order
      offset      <- object$data$offset
      Misc        <- object$data$Misc
      w           <- object$data$w
      cRows <- length(y)
      cCols <- length(x)/cRows
   }

   if(is.null(verbose))
   {
      verbose <- object$verbose
   }
   x <- as.vector(x)

   TDboost.obj <- .Call("TDboost",
                    Y = as.double(y),
                    Offset = as.double(offset),
                    X = as.double(x),
                    X.order = as.integer(x.order),
                    weights = as.double(w),
                    Misc = as.double(Misc),
                    cRows = as.integer(cRows),
                    cCols = as.integer(cCols),
                    var.type = as.integer(object$var.type),
                    var.monotone = as.integer(object$var.monotone),
                    distribution = as.character(object$distribution$name),
                    n.trees = as.integer(n.new.trees),
                    interaction.depth = as.integer(object$interaction.depth),
                    n.minobsinnode = as.integer(object$n.minobsinnode),
                    shrinkage = as.double(object$shrinkage),
                    bag.fraction = as.double(object$bag.fraction),
                    nTrain = as.integer(object$nTrain),
                    fit.old = as.double(object$fit),
                    n.cat.splits.old = length(object$c.splits),
                    n.trees.old = as.integer(object$n.trees),
                    verbose = as.integer(verbose),
                    PACKAGE = "TDboost")
   names(TDboost.obj) <- c("initF","fit","train.error","valid.error",
                       "oobag.improve","trees","c.splits")

   TDboost.obj$initF         <- object$initF
   TDboost.obj$train.error   <- c(object$train.error, TDboost.obj$train.error)
   TDboost.obj$valid.error   <- c(object$valid.error, TDboost.obj$valid.error)
   TDboost.obj$oobag.improve <- c(object$oobag.improve, TDboost.obj$oobag.improve)
   TDboost.obj$trees         <- c(object$trees, TDboost.obj$trees)
   TDboost.obj$c.splits      <- c(object$c.splits, TDboost.obj$c.splits)

   # cv.error not updated when using TDboost.more
   TDboost.obj$cv.error      <- object$cv.error

   TDboost.obj$n.trees        <- length(TDboost.obj$trees)
   TDboost.obj$distribution   <- object$distribution
   TDboost.obj$train.fraction <- object$train.fraction
   TDboost.obj$shrinkage      <- object$shrinkage
   TDboost.obj$bag.fraction   <- object$bag.fraction
   TDboost.obj$var.type       <- object$var.type
   TDboost.obj$var.monotone   <- object$var.monotone
   TDboost.obj$var.names      <- object$var.names
   TDboost.obj$interaction.depth <- object$interaction.depth
   TDboost.obj$n.minobsinnode    <- object$n.minobsinnode
   TDboost.obj$nTrain            <- object$nTrain
   TDboost.obj$Terms             <- object$Terms
   TDboost.obj$var.levels        <- object$var.levels
   TDboost.obj$verbose           <- verbose

   if(!is.null(object$data))
   {
      TDboost.obj$data <- object$data
   }
   else
   {
      TDboost.obj$data <- NULL
      TDboost.obj$m <- object$m
   }

   class(TDboost.obj) <- "TDboost"
   return(TDboost.obj)
}


TDboost.fit <- function(x,y,
                    offset = NULL,
                    misc = NULL,
                    distribution = list(name = "EDM", alpha = 1.5),
                    w = NULL,
                    var.monotone = NULL,
                    n.trees = 100,
                    interaction.depth = 1,
                    n.minobsinnode = 10,
                    shrinkage = 0.001,
                    bag.fraction = 0.5,
                    train.fraction = 1.0,
                    keep.data = TRUE,
                    verbose = TRUE,
                    var.names = NULL,
                    response.name = NULL)
{
   cRows <- nrow(x)
   cCols <- ncol(x)
   if(is.null(var.names))
   {
      if(is.matrix(x))
      {
         var.names <- colnames(x)
      }
      else if(is.data.frame(x))
      {
         var.names <- names(x)
      }
      else
      {
         var.names <- paste("X",1:cCols,sep="")
      }
   }
   if(is.null(response.name))
   {
      response.name <- "y"
   }

   # check dataset size
   if(cRows*train.fraction*bag.fraction <= 2*n.minobsinnode+1)
   {
      stop("The dataset size is too small or subsampling rate is too large: cRows*train.fraction*bag.fraction <= n.minobsinnode")
   }

   if(nrow(x) != length(y))
   {
      stop("The number of rows in x does not equal the length of y.")
   }

   if(interaction.depth < 1)
   {
      stop("interaction.depth must be at least 1.")
   }

   if(length(w)==0) w <- rep(1, cRows)
   else if(any(w < 0)) stop("negative weights not allowed")
   w <- w*length(w)/sum(w) # normalize to N

   if(any(is.na(y))) stop("Missing values are not allowed in the response, ",
                          response.name)

   if(is.null(offset) || (offset==0))
   {
      offset <- NA
   }
   else if(length(offset) != length(y))
   {
      stop("The length of offset does not equal the length of y.")
   }
   Misc <- NA

   # setup variable types
   var.type <- rep(0,cCols)
   var.levels <- vector("list",cCols)
   for(i in 1:length(var.type))
   {
      if(all(is.na(x[,i])))
      {
         stop("variable ",i,": ",var.names[i]," has only missing values.")
      }
      if(is.ordered(x[,i]))
      {
         var.levels[[i]] <- levels(x[,i])
         x[,i] <- as.numeric(x[,i])-1
         var.type[i] <- 0
      }
      else if(is.factor(x[,i]))
      {
         if(length(levels(x[,i]))>1024)
            stop("TDboost does not currently handle categorical variables with more than 1024 levels. Variable ",i,": ",var.names[i]," has ",length(levels(x[,i]))," levels.")
         var.levels[[i]] <- levels(x[,i])
         x[,i] <- as.numeric(x[,i])-1
         var.type[i] <- max(x[,i],na.rm=TRUE)+1
      }
      else if(is.numeric(x[,i]))
      {
         var.levels[[i]] <- quantile(x[,i],prob=(0:10)/10,na.rm=TRUE)
      }
      else
      {
         stop("variable ",i,": ",var.names[i]," is not of type numeric, ordered, or factor.")
      }

      # check for some variation in each variable
      if(length(unique(var.levels[[i]])) == 1)
      {
         warning("variable ",i,": ",var.names[i]," has no variation.")
      }
   }

   nTrain <- as.integer(train.fraction*cRows)

   if(is.character(distribution)) distribution <- list(name=distribution)
   if(!("name" %in% names(distribution)))
   {
      stop("The distribution is missing a 'name' component, for example list(name=\"EDM\")")
   }
   supported.distributions <-
      c("EDM")
   # check potential problems with the distributions
   if(!is.element(distribution$name,supported.distributions))
   {
      stop("Distribution ",distribution$name," is not supported")
   }
   if(!is.numeric(y))
   {
      stop("The response ",response.name," must be numeric. Factors must be converted to numeric")
   }
   if(distribution$name == "EDM")
   {
      if(is.null(distribution$alpha))
      {
         stop("For Tweedie models, the distribution parameter must be a list with a parameter 'alpha' indicating the index parameter, for example list(name=\"EDM\",alpha=1.5).")
      } else
      if((distribution$alpha<=1) || (distribution$alpha>2))
      {
         stop("alpha must be in (1,2].")
      }
      Misc <- c(alpha=distribution$alpha)
   }

   # create index upfront... subtract one for 0 based order
   x.order <- apply(x[1:nTrain,,drop=FALSE],2,order,na.last=FALSE)-1

   x <- as.vector(data.matrix(x))
   predF <- rep(0,length(y))
   train.error <- rep(0,n.trees)
   valid.error <- rep(0,n.trees)
   oobag.improve <- rep(0,n.trees)

   if(is.null(var.monotone)) var.monotone <- rep(0,cCols)
   else if(length(var.monotone)!=cCols)
   {
      stop("Length of var.monotone != number of predictors")
   }
   else if(!all(is.element(var.monotone,-1:1)))
   {
      stop("var.monotone must be -1, 0, or 1")
   }
   fError <- FALSE
   TDboost.obj <- .Call("TDboost",
                    Y=as.double(y),
                    Offset=as.double(offset),
                    X=as.double(x),
                    X.order=as.integer(x.order),
                    weights=as.double(w),
                    Misc=as.double(Misc),
                    cRows=as.integer(cRows),
                    cCols=as.integer(cCols),
                    var.type=as.integer(var.type),
                    var.monotone=as.integer(var.monotone),
                    distribution=as.character(distribution$name),
                    n.trees=as.integer(n.trees),
                    interaction.depth=as.integer(interaction.depth),
                    n.minobsinnode=as.integer(n.minobsinnode),
                    shrinkage=as.double(shrinkage),
                    bag.fraction=as.double(bag.fraction),
                    nTrain=as.integer(nTrain),
                    fit.old=as.double(NA),
                    n.cat.splits.old=as.integer(0),
                    n.trees.old=as.integer(0),
                    verbose=as.integer(verbose),
                    PACKAGE = "TDboost")
   names(TDboost.obj) <- c("initF","fit","train.error","valid.error",
                       "oobag.improve","trees","c.splits")

   TDboost.obj$bag.fraction <- bag.fraction
   TDboost.obj$distribution <- distribution
   TDboost.obj$interaction.depth <- interaction.depth
   TDboost.obj$n.minobsinnode <- n.minobsinnode
   TDboost.obj$n.trees <- length(TDboost.obj$trees)
   TDboost.obj$nTrain <- nTrain
   TDboost.obj$response.name <- response.name
   TDboost.obj$shrinkage <- shrinkage
   TDboost.obj$train.fraction <- train.fraction
   TDboost.obj$var.levels <- var.levels
   TDboost.obj$var.monotone <- var.monotone
   TDboost.obj$var.names <- var.names
   TDboost.obj$var.type <- var.type
   TDboost.obj$verbose <- verbose
   TDboost.obj$Terms <- NULL

   if(keep.data)
   {
      TDboost.obj$data <- list(y=y,x=x,x.order=x.order,offset=offset,Misc=Misc,w=w)
   }
   else
   {
      TDboost.obj$data <- NULL
   }

   class(TDboost.obj) <- "TDboost"
   return(TDboost.obj)
}


TDboost <- function(formula = formula(data),
                distribution = list(name = "EDM", alpha = 1.5),
                data = list(),
                weights,
                var.monotone = NULL,
                n.trees = 100,
                interaction.depth = 1,
                n.minobsinnode = 10,
                shrinkage = 0.001,
                bag.fraction = 0.5,
                train.fraction = 1.0,
                cv.folds=0,
                keep.data = TRUE,
                verbose = TRUE)
{
   mf <- match.call(expand.dots = FALSE)    
   m <- match(c("formula", "data", "weights", "offset"), names(mf), 0)
   mf <- mf[c(1, m)]
   mf$drop.unused.levels <- TRUE
   mf$na.action <- na.pass
   mf[[1]] <- as.name("model.frame")
   mf <- eval(mf, parent.frame())
   Terms <- attr(mf, "terms")

   y <- model.response(mf, "numeric")
   w <- model.weights(mf)
   offset <- model.offset(mf)
   
   var.names <- attributes(Terms)$term.labels
   x <- model.frame(terms(reformulate(var.names)),
                    data,
                    na.action=na.pass)

   # get the character name of the response variable
   response.name <- as.character(formula[[2]])
   if(is.character(distribution)) distribution <- list(name=distribution)

   cv.error <- NULL
   if(cv.folds>1)
   {
      i.train <- 1:floor(train.fraction*length(y))
      cv.group <- sample(rep(1:cv.folds, length=length(i.train)))
      cv.error <- rep(0, n.trees)
      for(i.cv in 1:cv.folds)
      {
         if(verbose) cat("CV:",i.cv,"\n")
         i <- order(cv.group==i.cv)
         TDboost.obj <- TDboost.fit(x[i.train,,drop=FALSE][i,,drop=FALSE], 
                            y[i.train][i],
                            offset = offset[i.train][i],
                            distribution = distribution,
                            w = ifelse(w==NULL,NULL,w[i.train][i]),
                            var.monotone = var.monotone,
                            n.trees = n.trees,
                            interaction.depth = interaction.depth,
                            n.minobsinnode = n.minobsinnode,
                            shrinkage = shrinkage,
                            bag.fraction = bag.fraction,
                            train.fraction = mean(cv.group!=i.cv),
                            keep.data = FALSE,
                            verbose = verbose,
                            var.names = var.names,
                            response.name = response.name)
         cv.error <- cv.error + TDboost.obj$valid.error*sum(cv.group==i.cv)
      }
      cv.error <- cv.error/length(i.train)
   }

   TDboost.obj <- TDboost.fit(x,y,
                      offset = offset,
                      distribution = distribution,
                      w = w,
                      var.monotone = var.monotone,
                      n.trees = n.trees,
                      interaction.depth = interaction.depth,
                      n.minobsinnode = n.minobsinnode,
                      shrinkage = shrinkage,
                      bag.fraction = bag.fraction,
                      train.fraction = train.fraction,
                      keep.data = keep.data,
                      verbose = verbose,
                      var.names = var.names,
                      response.name = response.name)
   TDboost.obj$Terms <- Terms
   TDboost.obj$cv.error <- cv.error
   TDboost.obj$cv.folds <- cv.folds

   return(TDboost.obj)
}


TDboost.perf <- function(object,
            plot.it=TRUE,
            oobag.curve=FALSE,
            overlay=TRUE,
            method)
{
   smoother <- NULL

   if((method == "OOB") || oobag.curve)
   {
      if(object$bag.fraction==1)
         stop("Cannot compute OOB estimate or the OOB curve when bag.fraction=1")
      if(all(!is.finite(object$oobag.improve)))
         stop("Cannot compute OOB estimate or the OOB curve. No finite OOB estimates of improvement")
      x <- 1:object$n.trees
      smoother <- loess(object$oobag.improve~x,
                        enp.target=min(max(4,length(x)/10),50))
      smoother$y <- smoother$fitted
      smoother$x <- x

      best.iter.oob <- x[which.min(-cumsum(smoother$y))]
      best.iter <- best.iter.oob
   }

   if(method == "OOB")
   {
      warning("OOB generally underestimates the optimal number of iterations although predictive performance is reasonably competitive. Using cv.folds>0 when calling TDboost usually results in improved predictive performance.")
   }

   if(method == "test")
   {
      best.iter.test <- which.min(object$valid.error)
      best.iter <- best.iter.test
   }

   if(method == "cv")
   {
      if(is.null(object$cv.error))
         stop("In order to use method=\"cv\" TDboost must be called with cv.folds>1.")
      if(length(object$cv.error) < object$n.trees)
         warning("cross-validation error is not computed for any additional iterations run using TDboost.more().")
      best.iter.cv <- which.min(object$cv.error)
      best.iter <- best.iter.cv
   }

   if(!is.element(method,c("OOB","test","cv")))
      stop("method must be cv, test, or OOB")

   if(plot.it)
   {
      par(mar=c(5,4,4,4)+.1)
      ylab <- "Exponential Dispersion Loss"
      if(object$train.fraction==1)
      {
         ylim <- range(object$train.error)
      }
      else
      {
         ylim <- range(object$train.error,object$valid.error)
      }

      plot(object$train.error,
           ylim=ylim,
           type="l",
           xlab="Iteration",ylab=ylab)

      if(object$train.fraction!=1)
      {
         lines(object$valid.error,col="red")
      }
      if(method=="cv")
      {
         lines(object$cv.error,col="green")
      }

      if(oobag.curve)
      {
         if(overlay)
         {
            par(new=TRUE)
            plot(smoother$x,
                 cumsum(smoother$y),
                 col="blue",
                 type="l",
                 xlab="",ylab="",
                 axes=FALSE)
            axis(4,srt=0)
            at <- mean(range(smoother$y))
            mtext(paste("OOB improvement in",ylab),side=4,srt=270,line=2)
            abline(h=0,col="blue",lwd=2)
            if(!is.na(best.iter)) abline(v=best.iter,col="blue",lwd=2)
         }

         plot(object$oobag.improve,type="l",
              xlab="Iteration",
              ylab=paste("OOB change in",ylab))
         lines(smoother,col="red",lwd=2)
         abline(h=0,col="blue",lwd=1)

         abline(v=best.iter,col="blue",lwd=1)
      }
   }

   return(best.iter)
}


relative.influence <- function(object,
                               n.trees)
{
   get.rel.inf <- function(obj)
   {
      lapply(split(obj[[6]],obj[[1]]),sum) # 6 - Improvement, 1 - var name
   }

   temp <- unlist(lapply(object$trees[1:n.trees],get.rel.inf))
   rel.inf.compact <- unlist(lapply(split(temp,names(temp)),sum))
   rel.inf.compact <- rel.inf.compact[names(rel.inf.compact)!="-1"]

   # rel.inf.compact excludes those variable that never entered the model
   # insert 0's for the excluded variables
   rel.inf <- rep(0,length(object$var.names))
   i <- as.numeric(names(rel.inf.compact))+1
   rel.inf[i] <- rel.inf.compact

   return(rel.inf=rel.inf)
}

TDboost.loss <- function(y,f,w,offset,dist,baseline)
{
   if(!is.na(offset))
   {
      f <- offset+f
   }
   switch(dist,
          gaussian = weighted.mean((y - f)^2,w) - baseline,
          bernoulli = -2*weighted.mean(y*f - log(1+exp(f)),w) - baseline,
          laplace = weighted.mean(abs(y-f),w) - baseline,
          adaboost = weighted.mean(exp(-(2*y-1)*f),w) - baseline,
          poisson = -2*weighted.mean(y*f-exp(f),w) - baseline,
          stop(paste("Distribution",dist,"is not yet supported for method=permutation.test.TDboost")))
}

permutation.test.TDboost <- function(object,
                                 n.trees)
{
   # get variables used in the model
   i.vars <- sort(unique(unlist(lapply(object$trees[1:n.trees],
                                       function(x){unique(x[[1]])}))))
   i.vars <- i.vars[i.vars!=-1] + 1
   rel.inf <- rep(0,length(object$var.names))

   if(!is.null(object$data))
   {
      y           <- object$data$y
      os          <- object$data$offset
      Misc        <- object$data$Misc
      w           <- object$data$w
      x <- matrix(object$data$x,ncol=length(object$var.names))
      object$Terms <- NULL # this makes predict.TDboost take x as it is
   }
   else
   {
      stop("Model was fit with keep.data=FALSE. permutation.test.TDboost has not been implemented for that case.")
   }

   # the index shuffler
   j <- sample(1:nrow(x))
   for(i in 1:length(i.vars))
   {
      x[ ,i.vars[i]]  <- x[j,i.vars[i]]

      new.pred <- predict.TDboost(object,newdata=x,n.trees=n.trees)
      rel.inf[i.vars[i]] <- TDboost.loss(y,new.pred,w,os,
                                     object$distribution$name,
                                     object$train.error[n.trees])

      x[j,i.vars[i]] <- x[ ,i.vars[i]]
   }

   return(rel.inf=rel.inf)
}


summary.TDboost <- function(object,
                        cBars=length(object$var.names),
                        n.trees=object$n.trees,
                        plotit=TRUE,
                        order=TRUE,
                        method=relative.influence,
                        normalize=TRUE,
                        ...)
{
   if(n.trees < 1)
   {
      stop("n.trees must be greater than 0.")
   }
   if(n.trees > object$n.trees)
   {
      warning("Exceeded total number of TDboost terms. Results use n.trees=",object$n.trees," terms.\n")
      n.trees <- object$n.trees
   }

   rel.inf <- method(object,n.trees)
   rel.inf[rel.inf<0] <- 0

   if(order)
   {
      i <- order(-rel.inf)
   }
   else
   {
      i <- 1:length(rel.inf)
   }
   if(cBars==0) cBars <- min(10,length(object$var.names))
   if(cBars>length(object$var.names)) cBars <- length(object$var.names)

   if(normalize) rel.inf <- 100*rel.inf/sum(rel.inf)

   if(plotit)
   {
      barplot(rel.inf[i[cBars:1]],
              horiz=TRUE,
              col=rainbow(cBars,start=3/6,end=4/6),
              names=object$var.names[i[cBars:1]],
              xlab="Relative influence",...)
   }
   return(data.frame(var=object$var.names[i],
                     rel.inf=rel.inf[i]))
}
emeryyi/tdboost documentation built on Aug. 16, 2022, 7:17 a.m.