R/pls-vip.R

Defines functions plot_pls_vip create_vip_rects extract_pls_vip VIPjh VIP

Documented in create_vip_rects extract_pls_vip plot_pls_vip

### VIP.R: Implementation of VIP (variable importance in projection)(*) for the
### `pls' package.
### $Id: VIP.R,v 1.2 2007/07/30 09:17:36 bhm Exp $

### Copyright: 2006,2007 Bjoern-Helge Mevik
### This program is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License version 2 as
### published by the Free Software Foundation.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.

### A copy of the GPL text is available here:
### http://www.gnu.org/licenses/gpl-2.0.txt

### Contact info:
### Boejrn-Helge Mevik
### bhx6@mevik.net
### Roedtvetvien 20
### N-0955 Oslo
### Norway

### (*) As described in Chong, Il-Gyo & Jun, Chi-Hyuck, 2005, Performance of
### some variable selection methods when multicollinearity is present,
### Chemometrics and Intelligent Laboratory Systems 78, 103--112.

## VIP returns all VIP values for all variables and all number of components,
## as a ncomp x nvars matrix.
VIP <- function(object) {
    # pb: added to avoid `R CMD check` note
    method <- Yloadings <- scores <- loading.weights <- NULL
    if (object$method != "oscorespls")
        stop("Only implemented for orthogonal scores algorithm.  Refit with 'method = \"oscorespls\"'")
    if (nrow(object$Yloadings) > 1)
        stop("Only implemented for single-response models")

    SS <- c(object$Yloadings)^2 * colSums(object$scores^2)
    Wnorm2 <- colSums(object$loading.weights^2)
    SSW <- sweep(object$loading.weights^2, 2, SS / Wnorm2, "*")
    sqrt(nrow(SSW) * apply(SSW, 1, cumsum) / cumsum(SS))
}


## VIPjh returns the VIP of variable j with h components
VIPjh <- function(object, j, h) {
    # pb: added to avoid `R CMD check` note
    method <- Yloadings <- scores <- loading.weights <- NULL
    if (object$method != "oscorespls")
        stop("Only implemented for orthogonal scores algorithm.  Refit with 'method = \"oscorespls\"'")
    if (nrow(object$Yloadings) > 1)
        stop("Only implemented for single-response models")

    b <- c(object$Yloadings)[1:h]
    T <- object$scores[,1:h, drop = FALSE]
    SS <- b^2 * colSums(T^2)
    W <- object$loading.weights[,1:h, drop = FALSE]
    Wnorm2 <- colSums(W^2)
    sqrt(nrow(W) * sum(SS * W[j,]^2 / Wnorm2) / sum(SS))
}

#' @title Extract VIPs (variable importance in the projection) for a PLS
#' regression model output returned from model fitting with
#' \code{simplerspec::fit_pls()}
#' @description VIPs are extracted based on the \code{finalModel} sublist
#' in the \code{caret::train} output contained in the \code{model} element
#' of the \code{simplerspec::fit_pls()} model output list. The VIPs for
#' derived number of PLS components in the \code{finalModel} are computed.
#' @param mout Model output list returned from \code{simplerspec::fit_pls()}.
#' @usage extract_pls_vip(mout)
#' @return A tibble data frame with columns \code{wavenumber} and correponding
#' VIP values in the column \code{vip} for the finally chosen PLS regression
#' model at the final number of PLS components.
#' @export
extract_pls_vip <- function(mout) {
  # Compute VIP for all wavenumbers and select only VIPs with ncomp in final
  # model
  final_model <- mout$model$finalModel
  vip <- VIP(object = final_model)[final_model$ncomp, ]
  # Collect wavenumbers from preprocessed spectra
  wn <- as.numeric(colnames(mout$data$calibration$spc_pre[[1]]))
  # Create a data frame with wavenumbers and VIP scores
  tibble::tibble(wavenumber = wn, vip = vip)
}

#' @title Create a data frame containing start and end positions (wavenumbers)
#' where variable importance in projection (VIP) is > 1
#' @description Given a data frame with VIP outputs (wavenumber and vip
#' columns), start and end values denoting spectral regions where VIP > 1
#' are returned as data frame. The functions can be used as helper
#' function for plotting VIP.
#' @param df_vip Data frame containing \code{wavenumber} and \code{vip} columns
#' (numeric)
#' @return Data.frame containing vectors \code{start} (numeric; wavenumber),
#' \code{end} (numeric; wavenumber) and group (integer; values are
#' \code{1:length(start))}.
#' @export
create_vip_rects <- function(df_vip) {

  # Avoid `R CMD check NOTE`: `no visible binding for global variable`
  VIP <- wavenumber <- tail <- NULL

  # Highlight region data
  # https://stackoverflow.com/questions/32543176/highlight-areas-within-certain-x-range-in-ggplot2
  v <- rep(0, nrow(df_vip))
  v[df_vip$vip > 1] <- 1
  # Get the start and end points for highlighted regions
  inds <- diff(c(0, v))
  start <- df_vip$wavenumber[inds == 1]
  end <- df_vip$wavenumber[inds == -1]
  if (length(start) > length(end)) {
    end <- c(end, tail(df_vip$wavenumber, 1))
  }
  # Create data frame for rectangle layer (geom_rects)
  data.frame(start = start, end = end, group = seq_along(start))
}

#' @title Plot stacked ggplot2 graphs with the Variable Importance for the
#' Projection (VIP) scores, mean replicate spectra (absorbance) per sample_id,
#'and the preprocessed spectra.
#' @description Plot stacked ggplot2 graphs of VIP for the final
#' PLS regression model output of the calibration (training) data set for the
#' final number of components, raw (replicate mean) spectra, and preprocessed
#' spectra. Regions with VIP > 1 are highlighted across the stacked graphs
#' in beige colour rectangles. VIP calculation is implemented as described in
#' Chong, I.-G., and Jun, C.-H. (2005). Performance of some variable selection
#' methods when multicollinearity is present. Chemometrics and Intelligent
#' Laboratory Systems, 78(1--2), 103--112. https://doi.org/10.1016/j.chemolab.2004.12.011
#' @param mout Model output list that is returned from
#' \code{simplerspec::fit_pls()}. This object contains a nested list with
#' the \code{caret::train()} object (class \code{train}), based on which
#' VIPs at finally selected number of PLS components are computed.
#' @param y1 Character vector of list-column name in
#' \code{mout$data$calibration}, where spectra for bottom graph are extracted.
#' Default is \code{"spc_mean"}, which plots the mean calibration spectra after
#' resampling.
#' @param y2 Character string of list-column name in
#' \code{mout$data$calibration}, where spectra for bottom graph are extracted.
#' Default is \code{"spc_pre"}, which plots the preprocessed calibration
#' spectra after resampling.
#' @param by Character string that is used to assign spectra to the same group
#' and therefore ensures that all spectra are plotted with the same colour.
#' Default is \code{"sample_id"}
#' @param xlab Character string of X axis title for shared x axis of stacked
#' graphs. Default is \code{expression(paste("Wavenumber [", cm^-1, "]"))}
#' @param ylab1 Y axis title of bottom spectrum. Default is \code{"Absorbance"}.
#' @param ylab2 Y axis title of bottom spectrum. Default is
#' \code{"Preprocessed Abs."}.
#' @param alpha Double between 0 and 1 that defines transparency of spectra
#' lines in returned graph (ggplot plot object).
#' @usage plot_pls_vip(mout, y1 = "spc_mean", y2 = "spc_pre",
#'   by = "sample_id",
#'   xlab = expression(paste("Wavenumber [", cm^-1, "]")),
#'   ylab1 = "Absorbance", ylab2 = "Preprocessed Abs.",
#'   alpha = 0.2)
#' @export
plot_pls_vip <- function(mout, y1 = "spc_mean", y2 = "spc_pre",
                         by = "sample_id",
                         xlab = expression(paste("Wavenumber [", cm^-1, "]")),
                         ylab1 = "Absorbance", ylab2 = "Preprocessed Abs.",
                         alpha = 0.2) {

  # Avoid `R CMD check` NOTE: no visible binding for global variable `...`
  variable <- wavenumber <- value <- group <- vip <- NULL

  # Extract spectra tibble for calibration
  spc_tbl <- mout$data$calibration
  # Gather spectra in one data.table

  ifelse(y1 == "spc",
    {dt1 <- data.table::data.table(do.call(rbind, spc_tbl[, y1][[y1]]))},
    {dt1 <- data.table::rbindlist(spc_tbl[, y1][[y1]])}
  )

  ifelse(y2 == "spc",
    {dt2 <- data.table::data.table(do.call(rbind, spc_tbl[, y2][[y2]]))},
    {dt2 <- data.table::rbindlist(spc_tbl[, y2][[y2]])}
  )

  # Extract ID variable and append it to the data.table
  id <- spc_tbl[, by][[by]]
  dt1[, id := id]
  dt2[, id := id]

  # Convert data table to long form for ggplot2 plotting
  dt1_long <- data.table::melt(
    dt1, measure = names(dt1)[!names(dt1) %in% c("id")]
  )
  dt1_long[, variable := as.numeric(as.character(variable))]
  data.table::setnames(dt1_long, old = "variable", new = "wavenumber")

  dt2_long <- data.table::melt(
    dt2, measure = names(dt2)[!names(dt2) %in% c("id")]
  )
  dt2_long[, variable := as.numeric(as.character(variable))]
  data.table::setnames(dt2_long, old = "variable", new = "wavenumber")

  # Plot spectra and VIP scores
  brk <- pretty(as.numeric(names(dt1)[!names(dt1) %in% c("id")]),
    n = 10)
  maxmin <- function(x) {c(max(x), min(x))}
  x_lim <- maxmin(as.numeric(names(dt1)[!names(dt1) %in% c("id")]))

  # Extract VIP (Variable Importance in Projection) scores for the given
  # model
  df_vip <- extract_pls_vip(mout)

  # Determine highlighted regions above VIP = 1
  rects <- create_vip_rects(df_vip)

  # Plot for resampled and mean replicate spectra
  p_spc <- ggplot2::ggplot(dt1_long, ggplot2::aes(x = wavenumber, y = value)) +
    ggplot2::geom_rect(data = rects, inherit.aes = FALSE,
      ggplot2::aes(xmin = start, xmax = end, ymin = min(dt1_long$value),
        ymax = max(dt1_long$value), group = group), color = "transparent",
        fill = "orange", alpha = 0.3) +
    ggplot2::geom_line(data = dt1_long, inherit.aes = FALSE,
      ggplot2::aes(x = wavenumber, y = value, group = id),
        alpha = alpha, size = 0.2) +
    ggplot2::scale_x_reverse(limits = x_lim, breaks = brk) +
    ggplot2::labs(x = xlab, y = ylab1) +
    ggplot2::theme_bw() +
    ggplot2::theme(plot.margin = ggplot2::unit(c(1, 5, -30, 6),
      units = "points"), axis.text.x = ggplot2::element_blank())

    p_spc_pre <- ggplot2::ggplot(dt2_long, ggplot2::aes(wavenumber, value)) +
    ggplot2::geom_rect(data = rects, inherit.aes = FALSE,
      ggplot2::aes(xmin = start, xmax = end, ymin = min(dt2_long$value),
        ymax = max(dt2_long$value), group = group), color = "transparent",
        fill = "orange", alpha = 0.3) +
    ggplot2::geom_line(ggplot2::aes(group = id), alpha = alpha, size = 0.2) +
    ggplot2::labs(x = xlab, y = ylab2) +
    ggplot2::theme_bw() +
    ggplot2::theme(plot.margin = ggplot2::unit(c(0, 5, 1, 1),
      units = "points")) +
    ggplot2::scale_x_reverse(limits = x_lim, breaks = brk) +
    ggplot2::theme(plot.margin = ggplot2::unit(c(1, 5, -30, 6),
      units = "points"),
      axis.title.y = ggplot2::element_text(vjust = 0.25),
      axis.text.x = ggplot2::element_blank())

  # Plot for VIP
  # Extract PLS model response variable and number of components
  response <- as.character(unique(mout$stats$response))
  ncomp <- mout$model$finalModel$ncomp
  p_vip <- ggplot2::ggplot(data = df_vip,
      ggplot2::aes(x = wavenumber, y = vip)) +
    ggplot2::geom_rect(data = rects, inherit.aes = FALSE,
      ggplot2::aes(xmin = start, xmax = end, ymin = min(df_vip$vip),
      ymax = max(df_vip$vip), group = group), color = "transparent",
      fill = "orange", alpha = 0.3) +
    ggplot2::geom_hline(yintercept = 1, colour = "red") +
    ggplot2::geom_line() +
    ggplot2::xlab(xlab) +
    ggplot2::ylab(
      bquote(paste(VIP[PLSR], " (", .(response),
        ", ", .(ncomp), " comps)"))) +
    ggplot2::scale_x_reverse(limits = x_lim, breaks = brk) +
    ggplot2::theme_bw() +
    ggplot2::theme(plot.margin = ggplot2::unit(c(0, 5, 1, 1),
      units = "points"), axis.title.y = ggplot2::element_text(vjust = 0.25))

  # Arrange plots in two panels without any margins in between
  # Hints from
  # https://stackoverflow.com/questions/42567045/alignment-of-two-plots-using-grid-arrange
  cowplot::plot_grid(
    p_spc, p_spc_pre, p_vip, rel_heights = c(0.4, 0.3, 0.3),
    ncol = 1, align = "v")
}
philipp-baumann/simplerspec documentation built on Oct. 3, 2023, 12:13 p.m.