R/mult-LDA.R


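# internal helper: reinserts the constant/collinear columns dropped before
# MASS::lda (as zero columns), so that the returned LDs have the same
# dimensions and column order as the original coefficient matrix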
.restore_LDs <- function(x){
  ids_drop <- c(x$ids_collinear, x$ids_constant)
  # nothing dropped case
  if (length(ids_drop)==0)
    return(x$LDs)
  ids_keep <- setdiff(1:ncol(x$x), ids_drop)
  # transpose unnormalized LDs so that columns are coe, rows LDs
  x_keep <- x$LDs %>% t
  # # build x_drop matrix from mean dropped coe
  # x_drop <- apply(x$x[, ids_drop], 2, mean) %>%
  #   list %>% rep(nrow(x_keep)) %>% do.call("rbind", .)
  x_drop <- matrix(0, nrow(x_keep), length(ids_drop))
  # dimnames it
  if (!is.null(names(ids_drop)))
    colnames(x_drop) <- names(ids_drop)
  rownames(x_drop) <- rownames(x_keep)

  # reinsert columns
  res <- reinsert_columns(x_keep, x_drop, ids_keep, ids_drop)

  # restore dimnames (overchecking but possibly safer)
  if (!is.null(colnames(x$x)))
    colnames(res) <- colnames(x$x)
  if (!is.null(rownames(x_keep)))
    rownames(res) <- rownames(x_keep)
  t(res)
}

# LDA methods on Coe -------------

#' Linear Discriminant Analysis on Coe objects
#'
#' Calculates an LDA on [Coe] objects, on top of [MASS::lda].
#'
#' @aliases LDA
#' @rdname LDA
#' @param x a [Coe] or a [PCA] object
#' @param fac the grouping factor (the name of a `$fac` column, or its column id)
#' @param retain the proportion of the total variance to retain (if retain<1) using \link{scree}, or the number of PC axes (if retain>1).
#' @param ... additional arguments to feed \link{lda}
#' @note For LDA.PCA, retain can be passed as a vector (e.g. 1:5, in which case retain=1, retain=2, ...,
#' retain=5 will each be tried), or as "best" (same as before, using retain=1:number_of_pc_axes).
#' @note Silence messages and progress bars (if any) with `options("verbose"=FALSE)`.
#' @return a 'LDA' object on which to apply \link{plot.LDA}, which is a list with components:
#' \itemize{
#'  \item \code{x} any \link{Coe} object (or a matrix)
#'  \item \code{fac} grouping factor used
#'  \item \code{removed} ids of columns removed from the original matrix because they were constant (if any)
#'  \item \code{mod} the raw lda mod from \link{lda}
#'  \item \code{mod.pred} the predicted model using x and mod
#'  \item \code{CV.fac} cross-validated classification
#'  \item \code{CV.tab} cross-validation table
#'  \item \code{CV.correct} proportion of correctly classified individuals
#'  \item \code{CV.ce} class accuracy, i.e. the proportion of correctly cross-validated individuals per class
#'  \item \code{LDs} unstandardized LDs; see Claude (2008)
#'  \item \code{mshape} mean values of coefficients in the original matrix
#'  \item \code{method} inherited from the Coe object (if any)
#' }
#' @family multivariate
#' @examples
#' bot.f <- efourier(bot, 24)
#' bot.p <- PCA(bot.f)
#' LDA(bot.p, 'type', retain=0.99) # retains 0.99 of the total variance
#' LDA(bot.p, 'type', retain=5) # retains 5 axes
#' bot.l <- LDA(bot.p, 'type', retain=0.99)
#' plot_LDA(bot.l)
#'
#' bot.f <- mutate(bot.f, plop=factor(rep(letters[1:4], each=10)))
#' bot.l <- LDA(PCA(bot.f), 'plop')
#' plot_LDA(bot.l) # will replace the former soon
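#'
#' # As noted above, retain can also be given as a vector of numbers of PCs
#' # to try, or as "best"; both return the corresponding CV accuracies.
#' # A quick sketch:
#' LDA(bot.p, 'type', retain=1:5)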
#' @export
LDA <- function(x, fac, retain, ...) {
  UseMethod("LDA")
}

#' @rdname LDA
#' @export
LDA.default <- function(x, fac, retain, ...) {

  # copy the original
  x0 <- x

  # some checks
  if (!is.matrix(x))
    x <- as.matrix(x)

  if (any(is.na(x)))
    stop("some NAs; LDA does not know how to handle them")

  if (missing(fac))
    stop("no fac provided")
  # dispatch fac
  f <- fac_dispatcher(x, fac)

  # check that at least two levels
  .check(nlevels(f)>1,
         "* fewer than two levels in the grouping factor passed")

  # handles constant and collinear variables
  ids_constant  <- which_constant(x)
  if (length(ids_constant)>0){
    ids_collinear <- which_collinear(x[, -ids_constant])
  } else {
    ids_collinear <- which_collinear(x)
  }

  # message about them, if verbose
  if (.is_verbose()){
    if (length(ids_constant)>0){
      if (is.null(colnames(x))){
        message("removed ", length(ids_constant), " constant columns")
      } else {
        message("removed these constant columns: ", paste(colnames(x)[ids_constant], collapse=", "))
      }
    }
    if (length(ids_collinear)>0){
      if (is.null(colnames(x))) {
        message("removed ", length(ids_collinear), " collinear columns")
      } else {
        message("removed these collinear columns: ", paste(colnames(x)[ids_collinear], collapse=", "))
      }
    }
  }

  # drop the columns concerned
  ids_drop <- c(ids_constant, ids_collinear)
  if (length(ids_drop)>0)
    x <- x[, -ids_drop]

  # now we calculate two lda models with MASS::lda,
  #   1st - without CV
  mod      <- MASS::lda(x = x, grouping=f, ...)
  mod.pred <- predict(mod, x)
  #   2nd - with leave-one-out cross validation
  CV.fac   <- MASS::lda(x=x, grouping=f,
                        tol = 1e-05, CV = TRUE, ...)$class

  # build a nice table from it
  CV.tab <- table(f, CV.fac)
  names(dimnames(CV.tab)) <- c("actual", "classified")
  CV.correct <- sum(diag(CV.tab))/sum(CV.tab)

  # return to unstandardized LDs
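  #   (following Claude (2008): the within-group variance-covariance matrix,
  #   estimated from the residuals of lm(x ~ f), is multiplied by the scaling
  #   returned by MASS::lda to express the LDs in the original variable space)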
  n <- nrow(x)
  lm.mod <- lm(x ~ f)
  dfw <- n - nlevels(f)
  SSw <- var(lm.mod$residuals) * (n - 1)
  VCVw <- SSw/dfw
  LDs <- VCVw %*% mod$scaling

  # calculate per-class accuracy (stored in $CV.ce)
  tab <- CV.tab
  ce <- sapply(seq_len(nrow(tab)),
               function(i) 1-(sum(tab[i, -i])/sum(tab[i, ])))
  names(ce) <- rownames(tab)

  # we build the list to be returned
  res <- list(x = x0,
              f = f,
              mod = mod,
              mod.pred = mod.pred,
              CV.fac = CV.fac,
              CV.tab = CV.tab,
              CV.correct = CV.correct,
              CV.ce = ce,
              LDs = LDs,
              ids_constant=ids_constant,
              ids_collinear=ids_collinear)

  if (is.list(x0) && !is.null(x0$mshape))
    res$mshape <- x0$mshape

  class(res) <- unique(c("LDA", class(res)))
  return(res)
}

#' @export
LDA.Coe <- function(x, fac, retain, ...) {
  # warnings are suppressed below since the important ones are already
  # turned into messages, and MASS::lda would stop anyway
  # if something really went wrong
  coes <- x$coe
  f <- fac_dispatcher(x, fac)

  if (any(is.na(coes)))
    stop("some NAs in the coefficients provided; LDA does not know how to handle them")
  if (any(is.na(f)))
    stop("some NAs in the factor provided; LDA does not know how to handle them")

  res <- suppressWarnings(
    LDA(coes, f)
  )
  if (!is.null(x$method))
    res$method <- x$method
  if (!is.null(x$cuts))
    res$cuts <- x$cuts
  if (!is.null(x$fac))
    res$fac <- x$fac
  # if (!is.null(x$mshape))
  #   res$mshape <- x$mshape

  res$mshape <- apply(x$coe, 2, mean)

  res
}

#' @rdname LDA
#' @export
LDA.PCA <- function(x, fac, retain = 0.99, ...) {

  # "best" expands to trying every possible number of PCs
  if (length(retain)==1 && identical(retain, "best"))
    retain <- 1:ncol(x$x)
  # a proportion of variance is converted into a number of PCs
  if (length(retain)==1 && is.numeric(retain) && retain < 1)
    retain <- scree_min(x, retain)

  # select best case
  if (length(retain)>1){
    discri <- numeric(length(retain))
    names(discri) <- paste0("PC1:", retain)
    for (i in seq_along(discri)){
      discri[i] <- LDA(x=x, fac=fac, retain = retain[i])$CV.correct
    }
    return(discri)
  }

  PCA <- x

  #fac handling
  f <- fac_dispatcher(x, fac)

  if (.is_verbose()) message(retain, " PC retained")
  X <- PCA$x[, 1:retain]
  if (is.matrix(X)) {
    remove <- which(apply(X, 2, sd) < 1e-10)
    if (length(remove) != 0) {
      cat(" * variables", colnames(X)[remove], "are removed since they are constant.\n")
      X <- X[, -remove]
    }
  } else {
    remove <- NULL
  }
  X <- as.matrix(X)
  # now we calculate two lda models with MASS::lda,
  #   1st - without CV
  mod <- MASS::lda(X, grouping=f, tol = 1e-08, ...)
  mod.pred <- predict(mod, X)
  #   2nd - with leave-one-out cross-validation
  CV.fac <- MASS::lda(X, grouping = f, tol = 1e-08, CV = TRUE, ...)$class
  # we build a nice table from it
  CV.tab <- table(f, CV.fac)
  names(dimnames(CV.tab)) <- c("actual", "classified")
  CV.correct <- sum(diag(CV.tab))/sum(CV.tab)
  # we calculate unstandardized LDs (not directly usable here for shape
  # reconstruction; that would need one more step (PCA2shp?),
  # but not sure how useful it would be)
  n <- nrow(X)
  lm.mod <- lm(X ~ f)
  dfw <- n - nlevels(f)
  SSw <- var(lm.mod$residuals) * (n - 1)
  VCVw <- SSw/dfw
  LDs <- VCVw %*% mod$scaling
  #
  # per-class accuracy (stored in $CV.ce)
  tab <- CV.tab
  ce <- numeric(nrow(tab))
  for (i in seq_len(nrow(tab))) ce[i] <- 1-(sum(tab[i, -i])/sum(tab[i, ]))
  names(ce) <- rownames(tab)

  LDA <- list(x = X,
              f = f,
              fac = x$fac,
              removed = remove,
              mod = mod,
              mod.pred = mod.pred,
              CV.fac = CV.fac,
              CV.tab = CV.tab,
              CV.correct = CV.correct,
              CV.ce = ce, LDs = LDs,
              mshape = NULL,
              method = "LDAPCA")  # may be interesting to add LDA on PCA here?
  class(LDA) <- c("LDA", class(LDA))
  return(LDA)
}

#' @export
print.LDA <- function(x, ...) {

  cat(" * Cross-validation table ($CV.tab):\n")
  print(x$CV.tab)

  cat("\n * Class accuracy ($CV.ce):\n")
  print(x$CV.ce)

  cat("\n * Leave-one-out cross-validation ($CV.correct): (",
      signif(x$CV.correct * 100, 3), "% - ",
      sum(diag(x$CV.tab)), "/", sum(x$CV.tab),
      "): \n", sep = "")

}

# LDA metrics -------

#' Calculate classification metrics on a confusion matrix
#'
#' In some cases, the overall proportion of correctly classified individuals, or even
#' per-class accuracy, is not informative enough, so this function provides more detailed metrics for classification results.
#'
#' @param x a \code{table} or an \link{LDA} object
#'
#' @return  a list with the following components is returned:
#' \enumerate{
#'   \item \code{accuracy} the fraction of instances that are correctly classified
#'   \item \code{macro_prf} data.frame containing \code{precision}
#'   (the fraction of correct predictions for a certain class),
#'   \code{recall} (the fraction of instances of a class that were correctly predicted),
#'   and \code{f1}, the harmonic mean of precision and recall
#'   \item \code{macro_avg}, just the average of the three \code{macro_prf} indices
#'   \item \code{ova} a list of one-vs-all confusion matrices, one per class
#'   \item \code{ova_sum} a single confusion matrix obtained by summing all \code{ova} matrices
#'   \item \code{kappa} Cohen's kappa, a measure of agreement between predictions and actual labels that corrects for chance agreement
#' }
#' @seealso The pages below are of great help for understanding these metrics. The code
#' used here is partly derived from the Revolution Analytics blog post (with their authorization). Thanks to them!
#' \enumerate{
#' \item \url{https://en.wikipedia.org/wiki/Precision_and_recall}
#' \item \code{https://blog.revolutionanalytics.com/2016/03/com_class_eval_metrics_r.html}
#' }
#' @family multivariate
#' @examples
#' # some morphometrics on 'hearts'
#' hearts %>%
#'   fgProcrustes(tol=1) %>%
#'   coo_slide(ldk=1) %>%
#'   efourier(norm=FALSE) %>%
#'   PCA() %>%
#'   # now the LDA and its summary
#'   LDA(~aut) %>%
#'   classification_metrics()
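#'
#' # A minimal sketch on a hand-made confusion matrix
#' # (arbitrary values, for illustration only)
#' tab <- table(actual     = rep(c("a", "b"), times=c(10, 10)),
#'              classified = c(rep("a", 8), rep("b", 2), rep("a", 3), rep("b", 7)))
#' classification_metrics(tab)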
#' @export
classification_metrics <- function(x){
  UseMethod("classification_metrics")
}

#' @export
classification_metrics.table <- function(x){
  tab <- x
  # check that a table is passed
  .check(is.table(tab),
         "only defined on 'table's")

  n <- sum(tab)           # nb of instances
  nc <- nrow(tab)         # nb of classes
  diag <- diag(tab)       # nb of correctly classified instances per class
  rowsums <- rowSums(tab) # nb of instances per class
  colsums <- colSums(tab) # nb of predictions per class
  p <- rowsums / n        # distribution of instances over the actual classes
  q <- colsums / n        # distribution of instances over the predicted classes

  # overall accuracy
  accuracy <- sum(diag) / n
  # precision, recall, F1
  precision <- diag / colsums
  recall <- diag / rowsums
  f1 <- 2 * precision * recall / (precision + recall)
  macro_prf <- dplyr::tibble(precision, recall, f1)

  # macro precision, recall, f1
  macro_avg <- dplyr::tibble(avg_precision=mean(precision),
                             avg_recall=mean(recall),
                             avg_f1=mean(f1))

  # one vs all
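  #   for each class, a 2x2 matrix with rows = actual and columns = classified:
  #   [TP, FN]   (class i classified as class i / as others)
  #   [FP, TN]   (others classified as class i / as others)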
  ova <- lapply(seq_len(nc),
                function(i){
                  v <- c(tab[i, i],
                         rowsums[i] - tab[i, i],
                         colsums[i] - tab[i, i],
                         n - rowsums[i] - colsums[i] + tab[i, i])
                  matrix(v, nrow = 2, byrow = TRUE)})
  # nice names
  names(ova) <- colnames(tab)
  for (i in seq_along(ova)){
    dimnames(ova[[i]]) <- list(actual=c(rownames(tab)[i], "others"),
                               classified=c(colnames(tab)[i], "others"))
  }

  # we add all these matrices
  ova_sum <- Reduce("+", ova)
  dimnames(ova_sum) %<>% lapply(`[<-`, 1, "relevant")
  # micro <- dplyr::tibble(accuracy=sum(diag(ova_sum)) / sum(ova_sum),
  # prf=(diag(ova_sum) / apply(ova_sum, 1, sum))[1])

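  # Cohen's kappa: accuracy corrected for the agreement expected by chance,
  # expAccuracy being the probability of agreement if actual and predicted
  # labels were independent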
  expAccuracy <- sum(p*q)
  kappa <- (accuracy - expAccuracy) / (1 - expAccuracy)

  list(accuracy=accuracy,
       macro_prf=macro_prf,
       macro_avg=macro_avg,
       ova = ova,
       ova_sum = ova_sum,
       kappa=kappa
  )
}

#' @export
classification_metrics.LDA <- function(x){
  classification_metrics(x$CV.tab)
}

# classify --------
# #' Classify using LDA
# #'
# #' @param x a Coe
# #' @param fac a standalone factor, or the name or id of the $fac column to use.If it contains
# #' NAs, they will also be removed first from the x object
# #' @param ref at least two level names from `$fac` to use as a training subset of x
# #' @param unk same as above for one level name to classify
# #'
# #' @return a list with components:
# #' \itemize{
# #' \item \code{$N_ref} the number of elements in the training set
# #' \item \code{$N_unk} the number of elements in the unknown set
# #' \item \code{$counts} counts of classification of 'unk' in each class of 'ref'
# #' \item \code{$pc} same thing as percentages
# #' \item \code{$probs} same thing as posterior probabilities
# #' \item \code{$probs} same thing as posterior but as a data.frame
# #' }
# #'
# #' @examples
# #' table(olea$var)
# #' x <- opoly(olea, 5)
# #' classify(x, fac="var", ref=c("Aglan","Cypre"), unk="PicMa")
# #' @export
# #' classify <- function(x, fac, ref, unk){
# #'   UseMethod("classify")
# #' }
# #' @export
# #' classify.default <- function(x, fac, ref, unk){
# #'   stop("method only available for objects of class 'Coe'")
# #' }
# #'
# #' @export
# #' classify.Coe <- function(x, fac, ref, unk){
# #'   # so that we can directly pass a fac
# #'   if (!is.factor(fac)){
# #'     fac <- x$fac[, fac]
# #'   }
# #'   # fac <- fac_dispatcher(x, fac)
# #'   # if any NAs, we remove them
# #'   if (any(is.na(fac))) {
# #'     x  <- x %>% slice(which(!is.na(fac)))
# #'     fac <- fac %>% na.omit() %>% factor()
# #'   }
# #'   # we filter for levels of interest
# #'   all_id  <- (fac %in% c(ref, unk))
# #'   # cat(all_id)
# #'   x <- slice(x, all_id)
# #'   fac <- fac[all_id]
# #'   # calculate a PCA using all taxa
# #'   P0 <- PCA(x)
# #'   # calculate an LDA using all but the unknown taxa
# #'   ref_id <- fac != unk
# #'   L0 <- P0 %>%
# #'     slice(ref_id) %>%
# #'     LDA(fac[ref_id], retain=0.99)
# #'   # extract and prepare scores of the unknown taxa
# #'   unk_id <- fac == unk
# #'   P1_all <- P0 %>%
# #'     slice(unk_id)
# #'   P1 <- P1_all$x[, 1:ncol(L0$x)]
# #'
# #'   # classify using the MASS::lda
# #'   pred <- predict(L0$mod, P1)
# #'   # prepare the results as a list
# #'   counts <- table(pred$class)
# #'   N_unk  <- sum(counts)
# #'   pc     <- round((counts / sum(counts))*100, 2)
# #'   probs  <- pred$posterior
# #' #
# #' #   probs_fac <- cbind(pred$posterior, select(P1_all$fac, Site, Period)) %>%
# #' #     group_by(Site, Period) %>%
# #' #     summarise_each(funs(mean))
# #' #   probs_fac <- dplyr::bind_cols(dplyr::select(probs_fac, 1:2), round(select(probs_fac, 3)))
# #'   probs_fac <- NULL
# #'
# #'   return(list(N_ref=nrow(L0$x),
# #'               N_ref_tab=table(L0$fac),
# #'               N_unk=N_unk,
# #'               counts=counts,
# #'               pc=pc,
# #'               probs=probs,
# #'               probs_fac=probs_fac))
# #' }

# reLDA -----------

#' "Redo" a LDA on new data
#'
#' Basically a wrapper around \link{predict.lda} from the package MASS. Uses a LDA model
#' to classify new data.
#' @param newdata to use, a \link{PCA} or any \link{Coe} object
#' @param LDA a \link{LDA} object
#' @return a list with components (from \link{predict.lda}):
#' \itemize{
#' \item \code{class} factor of classification
#' \item \code{posterior} posterior probabilities for the classes
#' \item \code{x} the scores of test cases
#' \item \code{res} data.frame of the results
#' \item \code{CV.tab} a confusion matrix of the results
#' \item \code{CV.correct} proportion of the diagonal of CV.tab
#' \item \code{newdata} the data passed to predict.lda
#' }
#' @note Uses the same number of PC axes as the LDA object provided. You should probably use \link{rePCA} in
#' conjunction with reLDA to get 'homologous' scores.
#' @examples
#' # We select the first 10 individuals in bot,
#' # for whisky and beer bottles. It will be our referential.
#' bot1   <- slice(bot, c(1:10, 21:30))
#' # Same thing for the other 10 individuals.
#' # It will be our unknown dataset on which we want
#' # to calculate classes.
#' bot2   <- slice(bot, c(11:20, 31:40))
#'
#' # We calculate efourier on these two datasets
#' bot1.f <- efourier(bot1, 8)
#' bot2.f <- efourier(bot2, 8)
#'
#' # Here we obtain our LDA model: first, a PCA, then a LDA
#' bot1.p <- PCA(bot1.f)
#' bot1.l <- LDA(bot1.p, "type")
#'
#' # we redo the same PCA since we worked with scores
#' bot2.p <- rePCA(bot1.p, bot2.f)
#'
#' # we finally "predict" with the model obtained before
#' bot2.l <- reLDA(bot2.p, bot1.l)
#' bot2.l
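#'
#' # a quick check of the predictions, assuming the actual types
#' # are known for the 'unknown' dataset (which they are here)
#' table(actual=bot2$fac$type, classified=bot2.l$class)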
#'
#' @rdname reLDA
#' @export
reLDA <- function(newdata, LDA){
  UseMethod("reLDA")
}

#' @rdname reLDA
#' @export
reLDA.default <- function(newdata, LDA){
  stop("reLDA requires a PCA or Coe object as newdata, along with an LDA object")
}

#' @rdname reLDA
#' @export
reLDA.PCA <- function(newdata, LDA){
  #   if (missing(newdata) | !any(class(newdata) == "PCA"))
  #     stop(" * a PCA object must be provided")
  mod <- LDA$mod
  nc <- ncol(LDA$x)
  reLDA <- predict(mod, newdata$x[, 1:nc])
  #   return(reLDA)
  #   if (length(LDA$f0)==1) {
  #     actual <- newdata$fac[, LDA$f0]
  #     if (!is.null(actual)) {
  #       reLDA$res <- dplyr::tibble(actual=actual, classified=reLDA$class)
  #       reLDA$CV.tab <- table(reLDA$res)
  #       reLDA$CV.correct <- sum(diag(reLDA$CV.tab)) / sum(reLDA$CV.tab)
  #     }
  #   }
  reLDA$newdata <- newdata$x[, 1:nc]
  #   class(reLDA) <- "LDA"
  return(reLDA)
}

#' @rdname reLDA
#' @export
reLDA.Coe <- function(newdata, LDA){
  #   if (missing(newdata) | !any(class(newdata) == "PCA"))
  #     stop(" * a PCA object must be provided")
  mod <- LDA$mod
  if (!identical(colnames(LDA$x),  colnames(newdata$coe)))
    stop("LDA and newdata object do not have the same structure")
  return(predict(mod, newdata$coe))
}