R/MP_management.R

## Package: MultiPattern
##
## Functions for managing/bookkeeping for MultiPattern clustering
##


## General package maintenance - import functions from packages
#' @import stats
#' @import utils
#' @import graphics
#' @import Rcssplot
NULL




##' Summary of default settings for new MultiPattern objects
##'
##' See details for a description of each component
##' 
##' num.random: number of random configurations
##' 
##' num.PCs: number of PCA components
##'
##' num.ICs: number of ICA components
##'
##' nmf.rank: number of NMF components
##'
##' nmf.bg: background value added to avoid null rows in NMF
##'
##' rpca.term.delta: used in rpca analysis (smaller than default for speed)
##'
##' clust.k: max number of clusters to use in easyConfig reg and alt
##' configurations
##'
##' subspace.num.random: number of random subspaces
##'
##' subspace.d.random: number/proportion of features for subspace analysis
##' 
##' alpha: exponent for similarity transformation for meta-similarities
##'
##' beta: determines Lbeta distance for meta-similarities
##'
##' subsample.N: number of observations to use in subsample/bootstrap
##'
##' subsample.R: number of subsampling/bootstrap repetitions
##' 
##' @export
MPdefaultSettings = list(
  num.random=60,
  num.PCs=4,
  num.ICs=4,
  nmf.rank=6,
  nmf.bg=0.1,
  rpca.term.delta=1e-3,
  dbscan.intervals=c(0.1, 0.2, 0.3, 0.4),
  clust.k=3,
  subspace.num.random=100,
  subspace.d.random=0.5,
  alpha=0.5,
  beta=2,
  subsample.N=150,
  subsample.R=20    
)




##' Create a basic MP object
##'
##' @param items character vector, specifies names for observations in the multipattern analysis
##' @param data named list with data objects. Specifying this parameter
##' is equivalent to calling MPnew() and then adding data manually with MPaddData()
##' 
##' @export 
MPnew = function(items, data=NULL) {
    
  ## create a blank MultiPattern object
  MP = list(items=items, data=list(), configs=list())
  MP$settings = MPdefaultSettings
  class(MP) = "MultiPattern"
  class(MP$settings) = "MultiPatternSettings"
  
  ## perhaps add datasets into this object
  if (!is.null(data)) {
    MP = MPaddData(MP, data)
  }
  
  MP
}




##' Add a dataset into a MultiPattern configuration object
##' 
##' @param MP an existing MultiPattern object
##' @param data named list with data objects. 
##'
##' @export
MPaddData = function(MP, data) {

  checkArgClass(MP, "MultiPattern")
  checkArgClass(data, "list")
  
  ## check naming of data objects
  datanames = names(data)
  if (is.null(datanames)) {
    stop("Names of data objects cannot be NULL\n")
  }
  badnames = datanames[datanames %in% names(MP$data)]
  if (length(badnames)>0) {
    stop("Duplicate data object names: ", paste(badnames, collapse=", "), "\n")
  }

  ## check data objects are non-empty
  sapply(datanames, function(x) {
    dx = data[[x]]
    if (nrow(dx)==0 | ncol(dx)==0) {
      stop(paste0("data objects cannot be empty (", x, " is empty)\n"))
    }
  })
  
  ## update the list of data objects in this MultiPattern configuration
  MP$data = c(MP$data, data)
  
  MP
}




##' Add one or more cluster configurations into a MultiPattern object
##'
##' The function acceps data.fun or dist.fun as a list. In these cases, the
##' configuration names are given as config.name:[id]. 
##' 
##' @param MP existing MP object
##' @param config.name character, base name for new subspace/distance
##' @param data.name character, name for data component in MP object. Defaults
##' to first dataset declared in MP. 
##' @param preprocess function or feature set, data preprocessing applied before
##' distance function. Can be a single value or a list. Defaults to NULL, which
##' indicates no preprocessing is required. When a function, the function is
##' applied on the dataset prior to computing dissimilarities.
##' When a vector (character or integer), it is interpreted a feature set of the dataset;
##' dissimilarities are then computed on this subspace of the data.
##' @param dist.fun function, distance function. Can be a single value or a list.
##' Defaults to euclidean distance. 
##'
##' @export
MPaddConfig = function(MP, config.name, data.name=names(MP$data)[1],
                       preprocess=NULL, dist.fun=dist.euclidean) {
  
  checkArgClass(MP, "MultiPattern")
  checkArgClass(config.name, "character")
  checkArgClass(data.name, "character")
  
  ## Hard checks for data compatibility
  if (!data.name %in% names(MP$data)) {
    stop("data.name -", data.name, "- not defined in MultiPattern object\n")
  }
  ## Hard check, accept only one of preprocess or dist.fun as a list
  if (class(preprocess)=="list" & class(dist.fun)=="list") {
    stop("only one of preprocess or dist.fun can be a list\n")
  }
  
  ## get names of new configurations.
  ## If inputs specify just one configuration, its name is just config.name.
  ## If inputs specify lists, config names are formated as config.name:[identifier]
  ## Identifiers are from names of the data.fun or dist.fun lists
  now.list = NULL
  if (class(preprocess)=="list") {
    now.list = preprocess
  } else if (class(dist.fun)=="list") {
    now.list = dist.fun
  }
  if (!is.null(now.list)) {
    numnew = length(now.list)
    if (length(now.list)==length(config.name)) {
      newnames = config.name
    } else  {
      if (is.null(names(now.list))) {
        newnames = paste0(config.name, ".", seq_along(now.list))
      } else {
        newnames = paste0(config.name, ".", names(now.list))
      }
    }
  } else {
    newnames = config.name
    dist.fun = list(a=dist.fun)
  }
  
  ## check if any of the new names overlap with existing names
  badnames = newnames[newnames %in% names(MP$configs)]
  if (length(badnames)>0) {        
    stop("Duplicate configuration names: ", paste(badnames, collapse=", "), "\n")
  }
  
  ## create a list with new configurations
  newconfigs = vector("list", length(newnames))
  names(newconfigs) = newnames;
  ## helper function to create an object with details for one analysis
  makeOneConf = function(nam, dat, prep, distfun) {
    ## create a list with all the input data
    aa = list()
    aa$name = nam
    aa$data = dat
    aa$prep = MPmakePrepFunction(prep)
    aa$dist.fun = distfun
    if (class(prep)!="function" & class(prep)!="NULL") {
      aa$info = prep
    }
    aa
  }
  
  ## create configuration objects and add to newconfigs list
  if (class(preprocess)=="list") {
    for (i in seq_along(preprocess)) {
      nowname = newnames[i]
      newconfigs[[nowname]] = makeOneConf(nowname, data.name, preprocess[[i]], dist.fun)
    }        
  } else if (class(dist.fun)=="list") {
    for (i in seq_along(dist.fun)) {            
      nowname = newnames[i]
      newconfigs[[nowname]] = makeOneConf(nowname, data.name, preprocess, dist.fun[[i]])
    }        
  }
  
  ## update MP with new configurations
  MP$configs = c(MP$configs, newconfigs)
  
  MP
}




##' Remove data or configuration components from a MultiPattern object
##' 
##' @param MP a MultiPattern configuration objects
##' @param data character or vector of data identifiers 
##' @param config character or vector of config identifiers 
##'
##' @export
MPremove = function(MP, data=NULL, config=NULL) {
  
  checkArgClass(MP, "MultiPattern")
  
  ## remove configurations
  if (!is.null(config)) {
    checkArgClass(config, "character")
    MP$configs[config] = NULL
  }
  
  ## remove data components
  if (!is.null(data)) {
    ## remove datasets from MP$data
    for (nowd in data) {
      if (nowd %in% names(MP$data)) {
        MP$data[[nowd]] = NULL
      } else {
        warning("Data object ", nowd, " is not in MultiPattern configuration\n");
      }
    }
    
    ## remove analysis configurations from MP$configs
    okconfigs = !sapply(MP$configs, function(x) x$data %in% data)
    MP$configs = MP$configs[okconfigs]
  }
  
  MP
}




##' Change settings encoded in a MultiPattern object
##' 
##' @param MP a MultiPattern configuration object
##' @param settings list with new settings
##' @param warn logical, set TRUE to get warnings if a setting value
##' is not part of the core MultiPattern set (MPdefaultSettings)
##' 
##' Accepted settings names are those defined in MPdefaultSettings.
##' 
##' @export
MPchangeSettings = function(MP, settings = list(), warn=TRUE) {
    
  checkArgClass(MP, "MultiPattern")
  checkArgClass(settings, "list")
  
  ## check that components in settings are allowed
  sn = names(settings)
  noncore = sn[!sn %in% names(MPdefaultSettings)]
  if (length(noncore)>0 & warn) {
    warning("changing non-core setting(s): ", paste(noncore, collapse=", "), "\n",
            call.=FALSE)
  }
  for (nows in names(settings)) {
    MP$settings[[nows]] = settings[[nows]]
  }
  
  MP
}
    



## ###################################################################################
## Functions for adding configurations
## ###################################################################################

##' Add predefined configuration types to a MultiPattern configuration object
##'
##' This function modifies the object MP in the first argument. The primary modifications
##' are new items in the MP$config list. When pca or rpca are specified in the type argument,
##' the function also precomputes these transformations and adds these into the MP$data list. 
##' All configurations relying on pca or rpca transformed data refer to these pre-computed objects.
##' 
##' @param MP MultiPattern configuration object
##' @param data character, names of datasets in MP to use to create suggestions. If NULL, function
##' applies the configurations to all datasets (but see type). If not NULL, function only considers
##' the specified data objects.
##' @param config.prefix character, prefix used in all configuration names
##' @param preprocess.prefix character, a middle-fix used when naming subspaceR configurations
##' @param type character or list, codes for what types of configuration plugins to use. To see all
##' available plugins, use MPlistPlugins(). 
##' shows all the supported configuration types. Detailed descriptions will appear elsewhere.
##' @param preprocess object specifying preprocessing, e.g. vector of features for subspaces
##' 
##' @export
MPeasyConfig = function(MP, data=NULL, config.prefix="",
                        preprocess.prefix="", 
                        type=c("pca", "euclidean", "spearman", "canberra",
                               "manhattan", "hclust", "pam", "dbscan"),
                        preprocess=NULL) {
  
  if (!is.null(data)) {
    data.missing = data[!(data %in% names(MP$data))]
    if (length(data.missing)) {
      stop("Missing data: ", data.missing, "\n")
    }
  }
  if (class(type)=="list") {
    if (!is.null(data)) {
      stop("when type is a list, data must null\n")
    }
    if (length(type)>0) {
      checkNotNull(names(type), "names in a type list")      
      data.type = names(type)
      data.type = data.type[!(data.type %in% names(MP$data))]
      if (length(data.type)>0) {
        stop("Unrecognized datasets: ", paste(data.type, collapse=", "), "\n")
      }      
    }
  }
  
  if (preprocess.prefix != "") {
    preprocess.prefix = paste0(":", preprocess.prefix)
  }
  
  ## standardize the input - into data=NULL and type=list(data=type)
  typelist = list()
  if (is.null(data)) {
    if (class(type)=="list") {
      typelist = type
    } else {
      typelist = setNames(vector("list", length(MP$data)), names(MP$data))
      typelist = lapply(typelist, function(x) {type})
    }
  } else {
    typelist = setNames(vector("list", length(data)), data)
    typelist = lapply(typelist, function(x) {type})
  }
  
  ## Add configurations by applying plugins    
  for (nowd in names(typelist)) {
    nowtypes = tolower(typelist[[nowd]])        
    for (nowtype in nowtypes) {
      plugin.fun = match.fun(paste0(nowtype, ".MultiPatternPlugin"))
      MP = plugin.fun(MP, nowd, config.prefix, preprocess.prefix, preprocess)
    }             
  }    
  
  MP
}
tkonopka/MultiPattern documentation built on May 31, 2019, 3:45 p.m.