R/analysis.R

Defines functions get.AIC fit.analysis multi.cluster

Documented in fit.analysis get.AIC multi.cluster

#' Wrapper for cluster.method functions
#' @param obj:  S3 object of class clusData
#' @param param.list:  list, each entry should be a list of threshold values
#'                     to pass to the clustering method
#' @param cluster.method:  function that returns data frame of known cases
#'                         annotated with cluster ID and growth
#' @param rangeID:  integer, unique index for rows generated by this function
#' @return list
#' @export
multi.cluster <- function(obj, param.list, cluster.method, rangeID=0) {
  # check that param.list has required inputs
  args <- formals(cluster.method)
  required <- names(args)[sapply(args, is.name)]
  required <- required[which(required!="obj")]
  if (!all(is.element(required, names(param.list[[1]])))) {
    stop("param.list missing one or more required parameters")
  }
  
  cluster.range <- lapply(1:length(param.list), function(i) {
    pl <- param.list[[i]]
    pl$setID <- i
    pl$obj <- obj
    suppressWarnings(do.call(cluster.method, pl))
  })
  cluster.range <- do.call(rbind, cluster.range)
  suppressWarnings(cluster.range[, "RangeID" := rangeID])
  
  return(cluster.range)
}


#' Predictive analysis on clusters
#'
#' Fits predictive model of some outcome (by default, cluster growth) to some 
#' cluster-level variable (by default, cluster size). This fit is done for each 
#' cluster set. Multiple models can be inputted as a named list of functions taking 
#' in cluster data (see example)
#'
#' @param cluster.data:  data.table, Inputted set(s) of clusters. Possibly 
#'                       multiple ranges.  The following columns are required:
#'                       Size: The number of sequences in clusters, not 
#'                             including new growth sequences.
#'                       Growth: The number of new sequences added to the 
#'                               cluster.
#'                       SetID: unique identifier for a set of clusters 
#'                              (obtained under given criteria)
#'                       Optionally, the user may also specify RangeID to 
#'                       uniquely identify a set of clusterings.
#' @param transforms:  list, A named list of transformation functions for each 
#'                     predictor variable, *e.g.*, `list("Data"==sum)`. Because 
#'                     clustered meta data takes the form of a list these 
#'                     functions are often necessary to obtain a single, 
#'                     cluster-level variable.  Typical functions include 
#'                     `mean` and `median`.
#' @param models:  list, A named list of functions, each of which applies a 
#'                 `glm` regression model to the cluster data.  The outcome 
#'                 variable should be `Growth` and the `family` argument should 
#'                 be "poisson", *e.g.*: `list("NullModel"=function(x) 
#'                 glm(Growth~Size, data=x, family="poisson"))
#'                 You may also use `glm.nb` from the `MASS` package to fit a 
#'                 negative binomial regression for overdispersion.
#' @return list, each entry labelled with SetID (to link back to the parameter 
#'         list).  Entries contain S3 objects of class "glm" or "lm".
#' @export
fit.analysis <- function(cluster.data, transforms, models) {
  # Check inputs
  predictors <- names(transforms)
  mod.names <- names(models)
  
  setIDs <- unique(cluster.data[, SetID])
  if (!all((predictors) %in% colnames(cluster.data))) {
    stop("Predictors referenced in transform step are not in the range of ",
    "cluster data")
  }
  if (!("RangeID" %in% colnames(cluster.data))) {
    warning("No range ID, by default this will be set to 0 for all sets")
    cluster.data[, "RangeID" := 0]
  }

  # Transform cluster data for modelling based on inputs
  model.data <- cluster.data[, c("Header", "Size", "Growth", "SetID", "RangeID")]
  
  # append predictor variables and apply transformations
  if(!is.null(predictors)) {
    model.data[, (predictors) := lapply(predictors, function(x) {
      sapply(cluster.data[, get(x)], function(z) {
        (transforms[[x]])(z)
      })
    })]
  }

  # Obtain fit data for each cluster set
  model.fits <- lapply(setIDs, function(sid) {
    lapply(models, function(pmod) {
      suppressWarnings(list(pmod(model.data[SetID == sid, ])))
    })  # this could be run in parallel
  })
  cluster.analysis <- data.table::data.table(
    SetID=setIDs, 
    RangeID=sapply(split(model.data$RangeID, model.data$SetID), function(x) x[1])
  )
  # append model fits
  for (mod in 1:length(mod.names)) {
    cluster.analysis[[mod.names[mod]]] <- sapply(model.fits, function(x) x[[mod]])
  }
  
  return(cluster.analysis)
}


#' Get AIC values from an analysis
#'
#' Takes a cluster.analysis and extracts AIC values from columns containing model 
#' fits. Fit columns are automatically identified
#'
#' @param cluster.analysis:  data.table, from some predictive growth model analysis 
#'  generated by fit.analysis()
#' @param param.list: list, clustering thresholds used to generate cluster
#' data with multi.cluster()
#' @return The AIC data for all columns containing fit objects. The column specifying 
#'  setID is retained
#' @export
get.AIC <- function(cluster.analysis, param.list){
  # Identify which columns in cluster.analysis holds model fits
  which.models <- sapply(cluster.analysis[1,], function(x){
    any(attr(x[[1]], "class") %in% c("lm", "glm"))
    })
  which.models <- which(which.models)
  if (length(which.models)==0) {
    stop("No fits in the data set provided")
  }
  model.fits <- cluster.analysis[ , ..which.models]
  
  result <- data.frame(
    SetID=cluster.analysis$SetID,
    RangeID=cluster.analysis$RangeID
    )
  
  # append parameter values
  # FIXME: it would be safer to actually retrieve these by SetID
  pnames <- names(param.list[[1]])
  for (pn in pnames) {
    result[[pn]] <- sapply(param.list, function(x) x[[pn]])
  }

  # extract AIC values from lm/glm objects into data frame
  temp <- apply(model.fits, 1, function(row) sapply(row, function(x) x$aic))
  result <- cbind(result, t(temp))
  
  return(result)
}
PoonLab/clustuneR documentation built on Jan. 29, 2024, 2:40 a.m.