R/mt_pcalda.R
In mt: Metabolomics Data Analysis Toolbox

Documented in pcalda pcalda.default pcalda.formula plot.pcalda predict.pcalda print.pcalda print.summary.pcalda summary.pcalda tune.pcalda

#' =========================================================================
#' wll-02-10-2007: tune the best number of components
tune.pcalda <- function(x, y, ncomp = NULL, tune.pars, ...) {
  n <- nrow(x)
  g <- length(levels(y))

  if (is.null(ncomp)) {
    ncomp <- max(g, n - g)
  } else {
    if (ncomp < 1 || ncomp > max(g, n - g)) {
      ncomp <- max(g, n - g)
    }
  }

  if (missing(tune.pars)) {
    tune.pars <- valipars(sampling = "rand", niter = 1, nreps = 4)
  }

  cat("ncomp tune (", ncomp, "):", sep = "")
  res <- sapply(1:ncomp, function(i) {
    cat(" ", i, sep = "")
    flush.console()
    accest(x, y,
      pars = tune.pars, method = "pcalda", ncomp = i,
      tune = F, ...
    )$acc
  })
  cat("\n")

  list(ncomp = which.max(res), acc.tune = res)
}

#' =========================================================================
#' wll-02-10-2007: Tune the best number of components
#' NOTE: This is a debug version of tune.pcalda, which can take user defined
#'       range of ncomp.
tune.pcalda.1 <- function(x, y, ncomp = NULL, tune.pars, ...) {
  n <- nrow(x)
  g <- length(levels(y))

  if (is.null(ncomp)) {
    ncomp <- 1:max(g, n - g)
  } else {
    if (max(ncomp) < 1 || max(ncomp) > max(g, n - g)) {
      ncomp <- 1:max(g, n - g)
    }
  }

  if (missing(tune.pars)) {
    tune.pars <- valipars(sampling = "rand", niter = 1, nreps = 4)
  }

  #'  res <- sapply(1:ncomp, function(i) {
  #'    accest(x, y, pars = tune.pars, method = "pcalda", ncomp=i, tune=F,...)$acc
  #'  })
  cat("ncomp tune (", max(ncomp), "):", sep = "")
  func <- function(i) {
    cat(" ", i, sep = "")
    flush.console()
    accest(
      dat = x, cl = y, pars = tune.pars, method = "pcalda", ncomp = i,
      tune = F, ...
    )$acc
  }
  res <- sapply(ncomp, FUN = func)
  names(res) <- ncomp
  cat("\n")

  list(ncomp = which.max(res), acc.tune = res)
}

#' =========================================================================
#' wll-02-10-2007: tune the best number of components
#'  NOTE: This is a test version of tune ncomp, which uses LDA directly and
#'        runs fast than tune.pcalda.
tune.pcalda.2 <- function(x, y, ncomp = NULL, center = TRUE, scale. = FALSE,
                          tune.pars, ...) {
  n <- nrow(x)
  g <- length(levels(y))

  if (is.null(ncomp)) {
    ncomp <- max(g, n - g)
  } else {
    if (ncomp < 1 || ncomp > max(g, n - g)) {
      ncomp <- max(g, n - g)
    }
  }

  pca.out <- prcomp(x, center = center, scale. = scale., ...)
  ncomp <- min(ncomp, length(pca.out$center))

  if (missing(tune.pars)) {
    tune.pars <- valipars(sampling = "rand", niter = 1, nreps = 10)
  }

  cat("ncomp tune (", ncomp, "):", sep = "")
  res <- sapply(1:ncomp, function(i) {
    cat(" ", i, sep = "")
    flush.console()
    acc <- accest(pca.out$x[, 1:i, drop = F], y,
      pars = tune.pars,
      method = "lda"
    )$acc
  })
  cat("\n")

  list(ncomp = which.max(res), acc.tune = res)
}

#' =========================================================================
#' PCA+LDA for classification
#' History:
#'   wll-22-06-2007: commence
#'   wll-01-07-2007: try number of PCs as n - g. Over-fitting.
#'   wll-24-01-2008: strip off constant PCs within group
pcalda.default <- function(x, y, center = TRUE, scale. = FALSE, ncomp = NULL,
                           tune = FALSE, ...) {

  #' arguments validity checking
  if (missing(x) || missing(y)) {
    stop("data set or class are missing")
  }
  x <- as.matrix(x)
  if (nrow(x) != length(y)) stop("x and y don't match.")
  y <- as.factor(y)
  if (any(table(y) == 0)) stop("Can't have empty classes in y.")
  #' lwc-NOTE: Or simple apply: y <- factor(y), which will drop factor levels

  if (length(unique(y)) < 2) {
    stop("Classification needs at least two classes.")
  }
  if (any(is.na(x)) || any(is.na(y))) {
    stop("NA is not permitted in data set or class labels.")
  }

  n <- nrow(x)
  g <- length(levels(y))

  #' The singularity problem of the within-class scatter matrix is overcome
  #' if number of retained PCs varies at least g to at most n-g. Here g and
  #' n is the number of classes and training data, respectively.
  #' if(is.null(ncomp)) ncomp <- max(g,n - g)

  #' NOTE-wll: Too good for training, which means overfits the training data.

  #' if(is.null(ncomp)) ncomp <- max(g,round(n/2))

  #' Check the number of components
  if (is.null(ncomp)) {
    ncomp <- if (tune) max(g, n - g) else max(g, round(n / 2))
  } else {
    if (ncomp < 1 || ncomp > max(g, n - g)) {
      ncomp <- max(g, n - g)
    }
  }

  #' find the best number of components
  if (tune) {
    val <- tune.pcalda(x, y, ncomp, ...)
    ncomp <- val$ncomp
  }

  #' dimension reduction by PCA
  pca.out <- prcomp(x, center = center, scale. = scale., ...)
  ncomp <- min(ncomp, length(pca.out$center))
  x.tmp <- pca.out$x[, 1:ncomp, drop = F]

  #' stip off PCs constant within groups
  x.tmp <- preproc.const(x.tmp, y)
  ncomp <- ncol(x.tmp)
  #' NOTE-28-01-2008: If the variables being stript off is not in the end
  #'   of columns, they positions should be sorted somewhere. But this
  #'   situation is rare in PCs. Refer to predict.pcalda where the ncomp is
  #'   used.

  lda.out <- lda(x.tmp, y)
  pred <- predict(lda.out, x.tmp)
  conf <- table(y, pred$class)
  acc <- round(sum(diag(conf)) * 100 / n, 2)

  x <- scale(x.tmp, center = colMeans(lda.out$means), scale = FALSE) %*%
    lda.out$scaling

  res <- list(
    x = x, cl = y, pred = pred$class, posterior = pred$posterior,
    conf = conf, acc = acc, ncomp = ncomp, pca.out = pca.out,
    lda.out = lda.out
  )

  if (tune) res$acc.tune <- val$acc.tune
  res$call <- match.call()
  res$call[[1]] <- as.name("pcalda")
  class(res) <- "pcalda"
  return(res)
}

#' =========================================================================
predict.pcalda <- function(object, newdata, ...) {
  if (!inherits(object, "pcalda")) stop("object not of class \"pcalda\"")
  if (missing(newdata)) {
    res <- list(class = object$pred, posterior = object$posterior, x = object$x)
    return(res)
  }

  if (is.null(dim(newdata))) {
    dim(newdata) <- c(1, length(newdata))
  } #' a row vector

  newdata <- as.matrix(newdata)
  if (ncol(newdata) != length(object$pca.out$center)) {
    stop("wrong number of variables")
  }

  #' rotated data (projection) by PCA
  x <- predict(object$pca.out, newdata)
  x <- x[, 1:object$ncomp, drop = F]

  #' predict using LDA
  pred <- predict(object$lda.out, x)
  #' list(class=pred$class, posterior = pred$posterior, x = pred$x)

  return(pred)
}

#' ========================================================================
#' wll-22-06-2007: print method for pcalda
#' wll-15-01-2008: add ratio(svd values)
print.pcalda <- function(x, ...) {
  cat("\nCall:\n", deparse(x$call), "\n")
  cat("\nNumber of components considered:", x$ncomp)

  cat("\nConfusion matrix of training data:\n")
  print(x$conf)
  cat("\nRatio of between- and within-group s.d.:\n")
  df <- colnames(x$x)
  ratio <- x$lda.out$svd
  names(ratio) <- df
  print(ratio)
  cat("\nAccurary rate of training data:\n")
  print(x$acc)
  invisible(x)
}

#' ========================================================================
#' lwc-22-06-2007: summary method for pcalda
summary.pcalda <- function(object, ...) {
  structure(object, class = "summary.pcalda")
}

#' ========================================================================
#' lwc-22-06-2007: summary method for pcalda
print.summary.pcalda <- function(x, ...) {
  print.pcalda(x)

  lev <- levels(x$cl)
  cat("\nNumber of Classes: ", length(lev), "\n\n")
  cat("Levels:", if (is.numeric(lev)) "(as integer)", "\n", lev)
  cat("\n\n")
}

#' ========================================================================
#' wll-13-12-2007: plot method for pcalda using lattice.
#' wll-11-10-2008: Definition of svd in lda documents:
#'   svd is the singular values, which give the ratio of the between- and
#'   within-group standard deviations on the linear discriminant variables.
#'   Their squares are the canonical F-statistics.
#' wll-15-01-2008: add svd listed in the plot.
plot.pcalda <- function(x, dimen, ...) {
  ld.names <- function(object, comps) {
    labs <- paste("LD", 1:length(object$means), sep = "")
    if (missing(comps)) {
      comps <- seq(along = labs)
    } else {
      labs <- labs[comps]
    }
    svd <- object$svd
    svd.p <- 100 * svd / sum(svd)
    #' svd.p <- 100 * svd^2/sum(svd^2)
    #' wll-11-01-2008: check lda's print method: Proportion of trace
    svd <- svd[comps]
    svd.p <- svd.p[comps]

    #' labs <- paste(labs, " (", format(svd.p, digits = 2, trim = TRUE),
    #'               " %)", sep = "")
    labs <- paste(labs, " (", format(svd, digits = 2, trim = TRUE), ", ",
      format(svd.p, digits = 2, trim = TRUE), "%)",
      sep = ""
    )
    return(labs)
  }

  if (missing(dimen)) {
    dimen <- seq(along = colnames(x$x))
  } else {
    #' check validity
    if (!all(dimen %in% c(1:ncol(x$x)))) {
      stop("dimen is not valid")
    }
  }
  dfn <- ld.names(x$lda.out, dimen)

  y <- x$cl
  x <- data.frame(x$x[, dimen, drop = FALSE])
  names(x) <- dfn

  #' call group plot
  p <- grpplot(x, y, plot = "pairs", ...)
  p
}

#' =========================================================================
#' lwc-22-06-2007: plot method for pcalda. It plot LDA scores.
plot.pcalda.1 <- function(x, panel = panel.pcalda, cex = 0.7, dimen,
                          abbrev = FALSE, ...) {
  panel.pcalda <- function(x, y, ...) {
    text(x, y, as.character(g.nlda), cex = tcex, col = unclass(g), ...)
  }

  ld.names <- function(object, comps) {
    labs <- paste("LD", 1:length(object$means), sep = "")
    if (missing(comps)) {
      comps <- seq(along = labs)
    } else {
      labs <- labs[comps]
    }
    svd <- object$svd
    svd <- 100 * svd^2 / sum(svd^2)
    evar <- svd[comps]

    labs <- paste(labs, " (", format(evar, digits = 2, trim = TRUE),
      " %)",
      sep = ""
    )
    return(labs)
  }

  xval <- x$x
  g <- x$cl

  if (abbrev) levels(g) <- abbreviate(levels(g), abbrev)
  assign("g.nlda", g)
  assign("tcex", cex)

  if (missing(dimen)) {
    dimen <- seq(along = colnames(xval))
  } else {
    #' check validity
    if (!all(dimen %in% c(1:ncol(xval)))) {
      stop("dimen is not valid")
    }
  }

  xval <- xval[, dimen, drop = FALSE]
  varlab <- ld.names(x$lda.out, dimen)
  nDimen <- length(dimen)

  if (nDimen <= 2) {
    if (nDimen == 1) { #' One component
      ldahist(xval[, 1], g, ...)
      #' ldahist(xval, g, xlab=varlab,...)
    } else { #' Second component versus first
      xlab <- varlab[1]
      ylab <- varlab[2]
      eqscplot(xval, xlab = xlab, ylab = ylab, type = "n", ...)
      panel(xval[, 1], xval[, 2], ...)
    }
  } else { #' Pairwise scatterplots of several components
    pairs(xval, labels = varlab, panel = panel, ...)
  }
  invisible(NULL)
}

#' =========================================================================
pcalda <- function(x, ...) UseMethod("pcalda")

#' =========================================================================
pcalda.formula <- function(formula, data = NULL, ..., subset,
                           na.action = na.omit) {
  call <- match.call()
  if (!inherits(formula, "formula")) {
    stop("method is only for formula objects")
  }
  m <- match.call(expand.dots = FALSE)
  if (identical(class(eval.parent(m$data)), "matrix")) {
    m$data <- as.data.frame(eval.parent(m$data))
  }
  m$... <- NULL
  m[[1]] <- as.name("model.frame")
  m$na.action <- na.action
  m <- eval(m, parent.frame())
  Terms <- attr(m, "terms")
  attr(Terms, "intercept") <- 0
  x <- model.matrix(Terms, m)
  y <- model.extract(m, "response")
  attr(x, "na.action") <- attr(y, "na.action") <- attr(m, "na.action")

  ret <- pcalda.default(x, y, ..., na.action = na.action)

  ret$call <- call
  ret$call[[1]] <- as.name("pcalda")
  ret$terms <- Terms
  if (!is.null(attr(m, "na.action"))) {
    ret$na.action <- attr(m, "na.action")
  }
  class(ret) <- c("pcalda.formula", class(ret))
  return(ret)
}

#'  1) tune.pcalda
#'  2) tune.pcalda.1
#'  3) tune.pcalda.2
#'  4) pcalda.default
#'  5) predict.pcalda
#'  6) print.pcalda
#'  7) summary.pcalda
#'  8) print.summary.pcalda
#'  9) plot.pcalda
#' 10) plot.pcalda.1
#' 11) pcalda
#' 12) pcalda.formula

Any scripts or data that you put into this service are public.

mt documentation built on June 22, 2024, 12:24 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

mt
Metabolomics Data Analysis Toolbox

R/mt_pcalda.R
In mt: Metabolomics Data Analysis Toolbox

Defines functions pcalda.formula pcalda plot.pcalda.1 plot.pcalda print.summary.pcalda summary.pcalda print.pcalda predict.pcalda pcalda.default tune.pcalda.2 tune.pcalda.1 tune.pcalda

Documented in pcalda pcalda.default pcalda.formula plot.pcalda predict.pcalda print.pcalda print.summary.pcalda summary.pcalda tune.pcalda

Try the mt package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mt Metabolomics Data Analysis Toolbox

R/mt_pcalda.R In mt: Metabolomics Data Analysis Toolbox

Defines functions pcalda.formula pcalda plot.pcalda.1 plot.pcalda print.summary.pcalda summary.pcalda print.pcalda predict.pcalda pcalda.default tune.pcalda.2 tune.pcalda.1 tune.pcalda

Documented in pcalda pcalda.default pcalda.formula plot.pcalda predict.pcalda print.pcalda print.summary.pcalda summary.pcalda tune.pcalda

Try the mt package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mt
Metabolomics Data Analysis Toolbox

R/mt_pcalda.R
In mt: Metabolomics Data Analysis Toolbox