R/simplica.R

Defines functions simplica

Documented in simplica

#' SIMPLICA: Simultaneous Identification of Simplivariate Components
#'
#' Implements the SIMPLICA algorithm to identify Simplivariate Components 
#' in data matrices using a genetic algorithm. These components are related to 
#' clusters or biclusters, but defined here in terms of specific structural 
#' patterns (constant, additive, multiplicative, or user-defined).
#'
#' @param df A numeric data matrix to analyze
#' @param maxIter Maximum number of generations for the genetic algorithm (default: 2000)
#' @param popSize Population size for the genetic algorithm (default: 300)
#' @param pCrossover Crossover probability for genetic algorithm (default: 0.6)
#' @param pMutation Mutation probability for genetic algorithm (default: 0.03)
#' @param zeroFraction Fraction of population initialized with zeros (default: 0.9)
#' @param elitism Number of best individuals preserved between generations (default: 100)
#' @param numSimComp Number of Simplivariate Components simultaneously optimized (default: 5)
#' @param verbose Logical, whether to print SIMPLICA progress information (default: FALSE)
#' @param mySeeds Vector of random seeds for replicate runs (default: 1:5)
#' @param interval Interval for monitoring GA progress (default: 100)
#' @param penalty Named vector of penalty values for each pattern type (default: c(constant = 0, additive = 1, multiplicative = 0))
#' @param patternFunctions List of pattern functions used for fitness evaluation (default: defaultPatternFunctions())
#' @param doSimplicaCV Logical, run cross-validated relabeling with simplicaCV() after GA (default: TRUE)
#' @param cvControl Optional list to tune simplicaCV; fields passed to simplicaCV via do.call.
#'   Defaults if omitted:
#'   \itemize{
#'     \item patternFitters = defaultPatternFitters()
#'     \item preferenceOrder = names(patternFunctions)
#'     \item nRepeats = 40
#'     \item testFraction = 0.2
#'     \item minCellsForModels = 25
#'     \item parsimonyMargin = 0.05
#'     \item requireFitters = TRUE
#'     \item updateObject = TRUE
#'     \item verbose = verbose
#'   }
#'
#' @return A list with:
#' \itemize{
#'   \item best: simplica object (includes original GA result; if doSimplicaCV=TRUE, also componentPatternsUpdated and componentAudit)
#'   \item raw: list of \code{"ga"} objects (one per seed, from the GA package)
#' }
#' 
#' @references
#' Hageman, J. A., Wehrens, R., & Buydens, L. M. C. (2008). "Simplivariate Models: Ideas and First Examples." PLoS ONE, 3(9), e3259. \doi{10.1371/journal.pone.0003259}
#'
#' Madeira, S. C., & Oliveira, A. L. (2004). "Biclustering Algorithms for Biological Data Analysis: A Survey." IEEE/ACM Transactions on Computational Biology and Bioinformatics, 1(1), 24–45. \doi{10.1109/TCBB.2004.2}
#' 
#' @examples
#' \donttest{
#' data("simplicaToy")
#' # Minimal run just to demonstrate function usage, run with default GA parameters
#' fit <- simplica(df = simplicaToy$data, 
#'                 maxIter = 200,
#'                 popSize = 50,
#'                 mySeeds = 1,
#'                 elitism = 1,
#'                 verbose = TRUE)
#' plotComponentResult(df = simplicaToy$data,
#'                     string            = fit$best$string,
#'                     componentPatterns = fit$best$componentPatternsUpdated,
#'                     componentScores   = fit$best$componentScores,
#'                     showAxisLabels    = FALSE,
#'                     title             = "SIMPLICA on simplicaToy",
#'                     scoreCutoff       = 25000)
#' }
#'
#' @importFrom stats setNames
#' @export
simplica <- function(df,
                     maxIter = 2000,
                     popSize = 300,
                     pCrossover = 0.6,
                     pMutation = 0.03,
                     zeroFraction = 0.9,
                     elitism = 100,
                     numSimComp = 5,
                     verbose = FALSE,
                     mySeeds = 1:5,
                     interval = 100,
                     penalty = c(constant = 0, additive = 1, multiplicative = 0),
                     patternFunctions = defaultPatternFunctions(),
                     doSimplicaCV = TRUE,
                     cvControl = NULL) {

  if (!requireNamespace("GA", quietly = TRUE)) stop("Please install 'GA'.")

  if (!(is.matrix(df) || is.data.frame(df))) {
    stop("df must be a numeric matrix or data.frame.")
  }
  df <- as.matrix(df)
  if (!is.numeric(df)) stop("df must be numeric.")

  nRows <- nrow(df)
  nCols <- ncol(df)
  dfMean <- mean(df)

  if (verbose) {
    cat("Starting SIMPLICA:\n")
    cat(sprintf("  Extracting %d simplivariate components from a %d x %d matrix\n",
                numSimComp, nRows, nCols))
  }

  rawGAResults <- lapply(seq_along(mySeeds), function(seedIndex) {
    if (verbose) {
      cat(sprintf("  Run %d of %d. (seed: %d)\n",
                  seedIndex, length(mySeeds), mySeeds[seedIndex]))
    }
    
    GA::ga(
      type        = "real-valued",
      fitness     = fitness,
      df          = df,
      dfMean      = dfMean,
      penalty     = penalty,
      patternFunctions = patternFunctions,
      zeroFraction = zeroFraction,
      lower       = rep(0, nRows + nCols),
      upper       = rep(numSimComp, nRows + nCols),
      population  = gaintegerPopulationFactory(zeroFraction, verbose = verbose),
      selection   = GA::ga_lrSelection,
      crossover   = gaintegerOnePointCrossover,
      mutation    = gaintegerMutation,
      parallel    = FALSE,
      seed        = mySeeds[seedIndex],
      monitor     = if (verbose) SCAMonitorFactory(interval) else FALSE,
      pmutation   = pMutation,
      pcrossover  = pCrossover,
      elitism     = elitism,
      popSize     = popSize,
      maxiter     = maxIter
    )
  })

  evalValues <- vapply(rawGAResults, function(x) x@fitnessValue, numeric(1))
  if (verbose) {
    cat(sprintf("  Fitness over %d seeds: %s\n",
                length(mySeeds), paste(round(evalValues, 2), collapse = ", ")))
  }

  sol <- rawGAResults[[which.max(evalValues)]]
  bestString <- as.numeric(sol@solution[1, ])

  bestResult <- fitness(bestString,
                        df = df,
                        dfMean = dfMean,
                        penalty = penalty,
                        patternFunctions = patternFunctions,
                        returnPatterns = TRUE)

  bestResult$string <- bestString
  bestResult$nRows <- nRows
  bestResult$nCols <- nCols
  class(bestResult) <- "simplica"

  if (isTRUE(doSimplicaCV)) {
    cvDefaults <- list(
      foundObject       = bestResult,
      df                = df,
      patternFunctions  = patternFunctions,
      patternFitters    = defaultPatternFitters(),
      preferenceOrder   = names(patternFunctions),
      nRepeats          = 40L,
      testFraction      = 0.2,
      minCellsForModels = 25L,
      parsimonyMargin   = 0.05,
      requireFitters    = TRUE,
      updateObject      = TRUE,
      verbose           = verbose
    )
    if (is.null(cvControl)) cvControl <- list()
    cvArgs <- utils::modifyList(cvDefaults, cvControl)
    if (verbose) cat("  Running CV...\n")
    bestResult <- do.call(simplicaCV, cvArgs)  # returns a simplica object if updateObject=TRUE
  }

  list(best = bestResult, raw = rawGAResults)
}

Try the SIMPLICA package in your browser

Any scripts or data that you put into this service are public.

SIMPLICA documentation built on Sept. 11, 2025, 1:08 a.m.