R/fuzzy_forest_obj.R
In fuzzyforest: Fuzzy Forests

Documented in fuzzy_forest modplot predict.fuzzy_forest print.fuzzy_forest

#' Fuzzy Forest Object
#'
#' Fuzzy forests returns an object of type
#' fuzzyforest.
#' @export
#' @param feature_list      List of selected features along with variable
#'                          importance measures.
#' @param final_rf          A final random forest fit using the features
#'                          selected by fuzzy forests.
#' @param module_membership Module membership of each feature.
#' @param WGCNA_object      If applicable, output of WGCNA analysis.
#' @param survivor_list     List of features that have survived screening step.
#' @param selection_list    List of features retained at each iteration of
#'                          selection step.
#' @return An object of type fuzzy_forest.
#' @note This work was partially funded by NSF IIS 1251151 and AMFAR 8721SC.
fuzzy_forest <- function(feature_list, final_rf, module_membership,
                         WGCNA_object=NULL, survivor_list, selection_list) {
  out <- list()
  out[[1]] <- feature_list
  out[[2]] <- final_rf
  out[[3]] <- module_membership
  out[[4]] <- module_membership
  out[[5]] <- survivor_list
  out[[6]] <- selection_list
  names(out) <- c("feature_list", "final_rf", "module_membership",
                  "WGCNA_object", "survivor_list", "selection_list")
  class(out) <- "fuzzy_forest"
  return(out)
}


#' Print fuzzy_forest object.
#' Prints output from fuzzy forests algorithm.
#' @export
#' @param x   A fuzzy_forest object.
#' @param ... Additional arguments not in use.
#' @return data.frame with list of selected features and variable
#'          importance measures.
#' @note This work was partially funded by NSF IIS 1251151 and AMFAR 8721SC.
print.fuzzy_forest <- function(x, ...) {
  print(x$feature_list)
  if(!is.null(x$final_rf$test)) {
    if(!is.null(x$final_rf$test$mse)) {
      cat(c("test set error: ", x$final_rf$test$mse[x$final_rf$ntree]))
    }
    if(!is.null(x$final_rf$test$err.rate)) {
      cat(c("test set error: ", x$final_rf$test$err.rate[x$final_rf$ntree]))
    }
  }
}

#' Predict method for fuzzy_forest object.
#' Obtains predictions from fuzzy forest algorithm.
#' @export
#' @param object   A fuzzy_forest object.
#' @param new_data A matrix or data.frame containing new_data.
#'                 Pay close attention to ensure feature names
#'                 match between training set and test set
#'                 data.frame.
#' @param ...      Additional arguments not in use.
#' @return A vector of predictions
#' @examples
#' library(mvtnorm)
#' gen_mod <- function(n, p, corr) {
#'   sigma <- matrix(corr, nrow=p, ncol=p)
#'   diag(sigma) <- 1
#'   X <- rmvnorm(n, sigma=sigma)
#'   return(X)
#' }
#'
#' gen_X <- function(n, mod_sizes, corr){
#'   m <- length(mod_sizes)
#'   X_list <- vector("list", length = m)
#'   for(i in 1:m){
#'     X_list[[i]] <- gen_mod(n, mod_sizes[i], corr[i])
#'   }
#'   X <- do.call("cbind", X_list)
#'   return(X)
#' }
#'
#' err_sd <- .5
#' n <- 500
#' mod_sizes <- rep(25, 4)
#' corr <- rep(.8, 4)
#' X <- gen_X(n, mod_sizes, corr)
#' beta <- rep(0, 100)
#' beta[c(1:4, 76:79)] <- 5
#' y <- X%*%beta + rnorm(n, sd=err_sd)
#' X <- as.data.frame(X)
#'
#' Xtest <- gen_X(n, mod_sizes, corr)
#' ytest <- Xtest%*%beta + rnorm(n, sd=err_sd)
#' Xtest <- as.data.frame(Xtest)
#'
#' cdist <- as.dist(1 - cor(X))
#' hclust_fit <- hclust(cdist, method="ward.D")
#' groups <- cutree(hclust_fit, k=4)
#'
#' screen_c <- screen_control(keep_fraction = .25,
#'                            ntree_factor = 1,
#'                            min_ntree = 250)
#' select_c <- select_control(number_selected = 10,
#'                            ntree_factor = 1,
#'                            min_ntree = 250)
#' \donttest{
#' ff_fit <- ff(X, y, module_membership = groups,
#'              screen_params = screen_c,
#'              select_params = select_c,
#'              final_ntree = 250)
#' #extract variable importance rankings
#' vims <- ff_fit$feature_list
#'
#' #plot results
#' modplot(ff_fit)
#'
#' #obtain predicted values for a new test set
#' preds <- predict(ff_fit, new_data=Xtest)
#'
#' #estimate test set error
#' test_err <- sqrt(sum((ytest - preds)^2)/n)
#' }
#' @seealso \code{\link[fuzzyforest]{ff}},
#'          \code{\link[fuzzyforest]{wff}},
#'          \code{\link[fuzzyforest]{ff.formula}},
#'          \code{\link[fuzzyforest]{wff.formula}}
#' @note This work was partially funded by NSF IIS 1251151 and AMFAR 8721SC.
predict.fuzzy_forest <- function(object, new_data, ...) {
  out <- predict(object$final_rf, new_data)
  return(out)
}

#' Plots relative importance of modules.
#'
#' The plot is designed
#' to depict the size of each module and what percentage of selected
#' features fall into each module.  In particular, it is easy to
#' determine which module is over-represented in the group of selected
#' features.
#' @export
#' @param object   A fuzzy_forest object.
#' @param main Title of plot.
#' @param xlab Title for the x axis.
#' @param ylab Title for the y axis.
#' @param module_labels Labels for the modules.  A data.frame
#'                      or character matrix with first column giving
#'                      the current name of module and second column giving
#'                      the assigned name of each module.
#' @seealso \code{\link[fuzzyforest]{ff}},
#'          \code{\link[fuzzyforest]{wff}},
#'          \code{\link[fuzzyforest]{ff.formula}},
#'          \code{\link[fuzzyforest]{wff.formula}}
#' @note This work was partially funded by NSF IIS 1251151 and AMFAR 8721SC.
modplot <- function(object, main=NULL, xlab=NULL, ylab=NULL,
                              module_labels=NULL) {
  if(is.null(main)) {
    main <- "Module Membership Distribution"
  }
  if(is.null(xlab)) {
   xlab <- "Module"
  }
  if(is.null(ylab)) {
    ylab <- "Percentage of features in module"
  }

  #allows user to supply new names for modules
  if(!is.null(module_labels)) {
    old_labels <- object$module_membership$module
    #module_labels should be re-ordered so that the old labels are in
    #alphabetical order.  This is because factor(old_labels) has levels in
    #alphabetical order. Note that the `labels` below is contains new labels.
    module_labels <- module_labels[order(module_labels[, 1]), ]
    new_labels <- as.character(factor(old_labels, labels=module_labels[, 2]))
    object$module_membership$module <- new_labels

    #Now module labels need to be changed for the table of variable importances.
    select_mods <- as.factor(object$feature_list$module_membership)
    select_module_table <- module_labels[which(module_labels[, 1] %in%
                                        levels(select_mods)), ,drop=FALSE]

    #This line of code may be slightly dangerous depending on where "." is.
    #It should be ok because after removing "." the remaining levels are in
    #alphabetical order.
    if( "." %in% levels(select_mods)) {
      dot_index <- which(levels(select_mods) == ".")
      levels(select_mods)[-dot_index] <- select_module_table[, 2]
      }
      else {
        levels(select_mods) <- select_module_table[, 2]
      }
      object$feature_list$module_membership <- as.character(select_mods)
  }
  mods <- object$module_membership$module
  mod_length <- length(mods)
  mod_tab <- table(mods)
  mod_name <- names(mod_tab)
  feature_list <- object$feature_list$module_membership
  #This line is here in the case that some covariates are not in a module
  mod_feature_list <- feature_list[feature_list != "."]
  #Table showing how many important features are in each selected module.
  imp_feature_tab <- table(mod_feature_list)
  imp_names <- names(imp_feature_tab)
  feature_tab <- rep(0, length(mod_tab))
  names(feature_tab) <- mod_name
  for(i in 1:length(feature_tab)) {
    if(mod_name[i] %in% names(imp_feature_tab)) {
      feature_tab[i] <- imp_feature_tab[which(imp_names == mod_name[i])]
      }
    }
  unimportant_pct <- (mod_tab - feature_tab)/mod_length
  important_pct <- feature_tab/mod_length
  mod_name <- rep(mod_name, 2)
  pct <- c(unimportant_pct, important_pct)
  pct_type <- rep(c("% Unimportant", "% Important"), each=length(mod_tab))
  importance_pct <- data.frame(Module=mod_name, Status=pct_type,
                               Percentage=pct)
  #test whether labels are numeric
  #reorder the labels if they are numeric
  num_mods <- suppressWarnings(as.numeric(object$module_membership[, 2]))
  num_test <- sum(is.na(num_mods))
  if(num_test == 0) {
    importance_pct[, 1] <- as.factor(importance_pct[, 1])
    levels(importance_pct[, 1]) <- sort(unique(num_mods))
  }

  #this is a work-around to get rid of notes in R CMD Check
  #Probably a better way to address the issue
  Module <- NULL
  Percentage <- NULL
  Status <- NULL
  ###############
  imp_plot <- ggplot(importance_pct, aes(x=Module, y=Percentage, fill=Status)) +
              geom_bar(stat="identity") +
              ggtitle(main) + labs(x = xlab, y = ylab) +
              theme(plot.title = element_text(lineheight=.8, face="bold"),
                    legend.title = element_blank())
  plot(imp_plot)
}