#' Wrapper for cluster.method functions
#' @param obj: S3 object of class clusData
#' @param param.list: list, each entry should be a list of threshold values
#' to pass to the clustering method
#' @param cluster.method: function that returns data frame of known cases
#' annotated with cluster ID and growth
#' @param rangeID: integer, unique index for rows generated by this function
#' @return list
#' @export
multi.cluster <- function(obj, param.list, cluster.method, rangeID=0) {
# check that param.list has required inputs
args <- formals(cluster.method)
required <- names(args)[sapply(args, is.name)]
required <- required[which(required!="obj")]
if (!all(is.element(required, names(param.list[[1]])))) {
stop("param.list missing one or more required parameters")
}
cluster.range <- lapply(1:length(param.list), function(i) {
pl <- param.list[[i]]
pl$setID <- i
pl$obj <- obj
suppressWarnings(do.call(cluster.method, pl))
})
cluster.range <- do.call(rbind, cluster.range)
suppressWarnings(cluster.range[, "RangeID" := rangeID])
return(cluster.range)
}
#' Predictive analysis on clusters
#'
#' Fits predictive model of some outcome (by default, cluster growth) to some
#' cluster-level variable (by default, cluster size). This fit is done for each
#' cluster set. Multiple models can be inputted as a named list of functions taking
#' in cluster data (see example)
#'
#' @param cluster.data: data.table, Inputted set(s) of clusters. Possibly
#' multiple ranges. The following columns are required:
#' Size: The number of sequences in clusters, not
#' including new growth sequences.
#' Growth: The number of new sequences added to the
#' cluster.
#' SetID: unique identifier for a set of clusters
#' (obtained under given criteria)
#' Optionally, the user may also specify RangeID to
#' uniquely identify a set of clusterings.
#' @param transforms: list, A named list of transformation functions for each
#' predictor variable, *e.g.*, `list("Data"==sum)`. Because
#' clustered meta data takes the form of a list these
#' functions are often necessary to obtain a single,
#' cluster-level variable. Typical functions include
#' `mean` and `median`.
#' @param models: list, A named list of functions, each of which applies a
#' `glm` regression model to the cluster data. The outcome
#' variable should be `Growth` and the `family` argument should
#' be "poisson", *e.g.*: `list("NullModel"=function(x)
#' glm(Growth~Size, data=x, family="poisson"))
#' You may also use `glm.nb` from the `MASS` package to fit a
#' negative binomial regression for overdispersion.
#' @return list, each entry labelled with SetID (to link back to the parameter
#' list). Entries contain S3 objects of class "glm" or "lm".
#' @export
fit.analysis <- function(cluster.data, transforms, models) {
# Check inputs
predictors <- names(transforms)
mod.names <- names(models)
setIDs <- unique(cluster.data[, SetID])
if (!all((predictors) %in% colnames(cluster.data))) {
stop("Predictors referenced in transform step are not in the range of ",
"cluster data")
}
if (!("RangeID" %in% colnames(cluster.data))) {
warning("No range ID, by default this will be set to 0 for all sets")
cluster.data[, "RangeID" := 0]
}
# Transform cluster data for modelling based on inputs
model.data <- cluster.data[, c("Header", "Size", "Growth", "SetID", "RangeID")]
# append predictor variables and apply transformations
if(!is.null(predictors)) {
model.data[, (predictors) := lapply(predictors, function(x) {
sapply(cluster.data[, get(x)], function(z) {
(transforms[[x]])(z)
})
})]
}
# Obtain fit data for each cluster set
model.fits <- lapply(setIDs, function(sid) {
lapply(models, function(pmod) {
suppressWarnings(list(pmod(model.data[SetID == sid, ])))
}) # this could be run in parallel
})
cluster.analysis <- data.table::data.table(
SetID=setIDs,
RangeID=sapply(split(model.data$RangeID, model.data$SetID), function(x) x[1])
)
# append model fits
for (mod in 1:length(mod.names)) {
cluster.analysis[[mod.names[mod]]] <- sapply(model.fits, function(x) x[[mod]])
}
return(cluster.analysis)
}
#' Get AIC values from an analysis
#'
#' Takes a cluster.analysis and extracts AIC values from columns containing model
#' fits. Fit columns are automatically identified
#'
#' @param cluster.analysis: data.table, from some predictive growth model analysis
#' generated by fit.analysis()
#' @param param.list: list, clustering thresholds used to generate cluster
#' data with multi.cluster()
#' @return The AIC data for all columns containing fit objects. The column specifying
#' setID is retained
#' @export
get.AIC <- function(cluster.analysis, param.list){
# Identify which columns in cluster.analysis holds model fits
which.models <- sapply(cluster.analysis[1,], function(x){
any(attr(x[[1]], "class") %in% c("lm", "glm"))
})
which.models <- which(which.models)
if (length(which.models)==0) {
stop("No fits in the data set provided")
}
model.fits <- cluster.analysis[ , ..which.models]
result <- data.frame(
SetID=cluster.analysis$SetID,
RangeID=cluster.analysis$RangeID
)
# append parameter values
# FIXME: it would be safer to actually retrieve these by SetID
pnames <- names(param.list[[1]])
for (pn in pnames) {
result[[pn]] <- sapply(param.list, function(x) x[[pn]])
}
# extract AIC values from lm/glm objects into data frame
temp <- apply(model.fits, 1, function(row) sapply(row, function(x) x$aic))
result <- cbind(result, t(temp))
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.