MBXUCL: R Routines for metabolomic data

Documented in cvOPLSDA OPLSDA OPLSDA_pred

# =============================================================== fonction OPLSDA

# OPLS-DA for a two-class problem
#' @export OPLSDA
#' @title OPLS-DA for a two-class problem
#' @description OPLS-DA for a two-class problem. Only one response is allowed and the number of predictive components is fixed to 1 accordingly.
#'
#' @param x  A data matrix on which will be based the analysis.
#' @param y  A numerical vector representing the class of individuals.
#' @param impT If \code{TRUE}, prints the results.
#' @param impG If \code{TRUE}, save results' graphics in pdf format.
#' @param no Number of orthogonal components to keep.
#' @param out.path Path to output the results' graphics
#' @param nb Number of biomarkers to select.
#'
#' @return A list with the following elements:
#' \describe{
#'   \item{\code{b}}{Model coefficients}
#'   \item{\code{Tp}}{Predictive scores}
#'   \item{\code{Pp}}{Predictive loadings}
#'   \item{\code{W}}{X-weights}
#'   \item{\code{C}}{y-weights}
#'   \item{\code{Tortho}}{Orthogonal scores}
#'   \item{\code{Portho}}{Orthogonal loadings}
#'   \item{\code{Wortho}}{Orthogonal weights matrix}
#'   \item{\code{Selected.biomarkers}}{Vector of identified biomarkers}
#'   \item{\code{CV}}{Criterion for the number of orthogonal components to keep}
#'   \item{\code{original.dataset}}{Original X matrix}
#'   \item{\code{Xopls}}{OPLS-filtered X matrix}
#'   \item{\code{original.response}}{Original y vector}
#' }
#'
#' @details
#' The function allows only one predictive component since it is designed for a single dependent variable.
#' It is based on the NIPALS PLS algorithm: after the removal of orthogonal components from the X matrix, a PLS1
#' NIPALS is ran on the filtered X matrix.
#'
#' @examples
#' data('DataSimul')
#' x = DataSimul[['x']]
#' y = DataSimul[['y']]
#' oplsda.res = OPLSDA(x=x, y=y, impT = FALSE,impG = FALSE, no=2, nb = 15, out.path = '.')
#'
#' @importFrom grDevices dev.off pdf
#' @importFrom graphics abline barplot legend text title
#' @importFrom stats cov sd


OPLSDA <- function(x, y, impT = FALSE, impG = FALSE, no = 2, nb = 15, out.path = ".") {

  # checks

  # if (sum(!y %in% c(0,1)) >0) { y = as.factor(y) if (nlevels(y) >2) { stop('There
  # is more than 2 levels in y') } else { warning('levels of y are automaticaly
  # transformed in 0 and 1s') y[y==levels(y)[1]] = 0 y[y==levels(y)[2]] = 1 } }

  y <- as.numeric(y)

  checkArg(impT, "bool", can.be.null = FALSE)
  checkArg(impG, "bool", can.be.null = FALSE)
  checkArg(no, "int", can.be.null = FALSE)
  checkArg(nb, "int", can.be.null = FALSE)
  checkArg(out.path, "str", can.be.null = FALSE)


  xoriginal <- x
  varnames <- dimnames(x)[[2]]
  obsnames <- dimnames(x)[[1]]

  # center x and y
  ytrain <- scale(y,center = TRUE, scale = FALSE)

  colmeans <- apply(x, 2, mean)
  xtrain <- scale(x,center = TRUE, scale = FALSE)


  m <- dim(x)[2]  # nombre de descripteurs
  n <- dim(x)[1]

  if (nb > m){
    stop("nb is superior to the total number of descriptors, you need to reduce its value")
  }


  options(warn = -1)
  xax <- as.numeric(colnames(x))
  if (sum(is.na(xax)) > 0) {
    xax <- c(1:m)
  }
  options(warn = 1)


  # pls_nipals function ------------------------------

  pls_nipals <- function(xtrain, ytrain, np = 1) {

    xtrain <- scale(xtrain, center = T, scale = F)

    n <- dim(xtrain)[1]
    m <- dim(xtrain)[2]

    if (np < 1)  {
      stop("np should be at least 1")
    }

    Tp <- matrix(NA, ncol = np, nrow = n, dimnames = list(obsnames, NULL))
    Pp <- matrix(NA, ncol = np, nrow = m, dimnames = list(varnames, NULL))
    C <- matrix(NA, nrow = 1, ncol = np)
    W <- matrix(NA, ncol = np, nrow = m)

    for (j in 1:np)  {
      W[, j] <- as.vector((t(xtrain) %*% ytrain) %*% solve(t(ytrain) %*% ytrain))  # X-weights w
      norm.w <- sqrt(W[, j] %*% W[, j])  # norm of w (vector)
      W[, j] <- W[, j] %*% solve(norm.w)  # normalised w
      Tp[, j] <- (xtrain %*% W[, j]) %*% solve(t(W[, j]) %*% W[, j])  # X-scores
      C[, j] <- t(ytrain) %*% Tp[, j] %*% solve(t(Tp[, j]) %*% Tp[, j])  # y-weights
      Pp[, j] <- as.vector((t(xtrain) %*% Tp[, j]) %*% solve(t(Tp[, j]) %*%
        Tp[, j]))  # 1st PLS loading
      xtrain <- xtrain - Tp[, j] %*% t(Pp[, j])
    }

    b <- W %*% solve(t(Pp) %*% W) %*% as.vector(C)

    return(list(Tp = Tp, Pp = Pp, C = C, W = W, b = b))
  }


  # Initialisation des matrices de sortie de l'algorithme it\'eratif :
  Tortho <- matrix(NA, nrow = n, ncol = no, dimnames = list(obsnames, NULL))
  Portho <- matrix(NA, nrow = m, ncol = no, dimnames = list(varnames, NULL))
  Wortho <- matrix(NA, nrow = m, ncol = no)
  CV <- matrix(NA, nrow = 1, ncol = no)
  VarXortho <- matrix(NA, nrow = 1, ncol = no)
  VarX <- matrix(NA, nrow = 1, ncol = no)


  # Debut de l'algorithme de l'OPLS-DA avec le calcul des poids de X :
  w <- (t(xtrain) %*% ytrain) %*% solve(t(ytrain) %*% ytrain)  # X-weights w
  w <- as.vector(w)
  norm.w <- sqrt(w %*% w)  # norm of w (vector)
  w <- w %*% solve(norm.w)  # normalised w


  # creation de la boucle pour extraire les composantes orthogonales

  for (i in 1:no) {
    t <- (xtrain %*% w) %*% solve(t(w) %*% w)  # X-scores
    c <- t(ytrain) %*% t %*% solve(t(t) %*% t)  # y-weights
    u <- ytrain %*% c %*% solve(t(c) %*% c)  # y-scores
    p <- as.vector((t(xtrain) %*% t) %*% solve(t(t) %*% t))  # 1st PLS loading
    norm.p <- sqrt(p %*% p)
    Wortho[, i] <- as.vector(p - (w %*% (t(w) %*% p) %*% solve(t(w) %*% w)))  # Xortho-weights
    norm.wo <- sqrt(Wortho[, i] %*% Wortho[, i])
    Wortho[, i] <- Wortho[, i] %*% solve(norm.wo)  # normalised wortho
    CV[, i] <- norm.wo %*% solve(norm.p)  # criterion for the number of orthogonal components to keep
    Tortho[, i] <- xtrain %*% Wortho[, i] %*% solve(t(Wortho[, i]) %*% Wortho[,
      i])  # Xortho-scores
    Portho[, i] <- t(xtrain) %*% Tortho[, i] %*% solve(t(Tortho[, i]) %*% Tortho[,
      i])  # ortho. loading

    varortho <- Tortho[, i] %*% t(Portho[, i])
    var <- t %*% t(p)

    xtrain <- xtrain - Tortho[, i] %*% t(Portho[, i])

    # matrice var-cov
    covo <- cov(varortho)  # otho
    covp <- cov(xtrain)  # pred
    covorig <- cov(cbind(xoriginal))

    # total variation X
    ## var orthog retiree, en %
    VarXortho[, i] <- 100 * (sum(diag(covo))/sum(diag(covorig)))
    ## var non orthog, en %
    VarX[, i] <- 100 * (sum(diag(covp))/sum(diag(covorig)))

  }


  # X orthogonal
  xortho <- Tortho %*% t(Portho)


  # Apply nipals pls on filtered X matrix (Xnew)
  xnew <- xtrain  # deflated matrix
  np = 1
  res_nipals <- pls_nipals(xnew, ytrain, np = np)

  invisible(list2env(res_nipals, envir = environment()))

  Tp = res_nipals[["Tp"]]
  Pp = res_nipals[["Pp"]]
  C = res_nipals[["C"]]
  W = res_nipals[["W"]]
  b = res_nipals[["b"]]

  b <- as.vector(b)

  # In order to apply b directly on a new X, b becomes bcorr
  bcorr <- (diag(1, m, m) - Wortho %*% solve(t(Portho) %*% Wortho) %*% t(Portho)) %*%  b
  names(bcorr) <- varnames
  bcorr <- as.vector(bcorr)


  # Recherche des nb biomarkeurs ayant les b les plus grands
  namindbiom <- order(abs(b))[((m + 1) - (1:nb))]
  indbiom <- b[namindbiom]


  # sortie
  ropls <- list(b = bcorr, Tp = Tp, Pp = Pp, W = W, C = C, Tortho = Tortho, Portho = Portho,
    Wortho = Wortho, Selected.biomarkers = indbiom, CV = CV, original.dataset = xoriginal,
    Xopls = xnew, original.response = y)


  # Sorties graphiques

  if (impG == TRUE) {

    # Validation plot
    # pdf(file.path(out.path, "OPLS_Validation.pdf"), width = 10, height = 6)
    #
    # COL <- rep("gray93", no)
    # # par(mar=c(4,4,4,4))
    # mp <- barplot(t(CV), axes = F, axisnames = F, border = 1, col = COL)
    # axis(1, at = mp, labels = c(1:no))
    # axis(2)
    # title(main = "OPLS: Choice of the n[orthog. Components]", xlab = "Orthogonal OPLS-DA Components",
    #   ylab = "||Wortho|| / ||p||")
    #
    # dev.off()


    # Xvar explique
    pdf(file.path(out.path, "OPLS_Xvar.pdf"), width = 6, height = 6)

    COL <- rep("gray93", no + 1)
    COL[1] <- "cyan1"
    mp1 <- graphics::barplot(c(VarX[no], VarXortho), axes = F, axisnames = F,
                            width = rep(0.7, no), border = 1, col = COL)
    axis(1, at = mp1, tick = F, labels = c("PC", paste0("OC", 1:no)))
    axis(2)
    mtext(paste(round(c(VarX[no], VarXortho), 1), "%", sep = " "), col = "gray40",
          at = mp1, side = 1)
    title(main = "OPLS: Percentage of X-variance \n explained by each component",
          xlab = "OPLS Components", ylab = "% X-variance")

    dev.off()

    # OPLS-DA score scatter plot
    index <- ceiling(no/2)

    pdf(file.path(out.path, "OPLS_Score.pdf"), width = 12, height = 6 * index)
    col <- numeric()
    col[ytrain == 0] <- 3
    col[ytrain == 1] <- 2

    par(mfrow = c(ceiling(no/2), 2))
    for (i in 1:no) {
      max.pc1 <- 1.1 * (max(abs(Tp[, np])))
      max.pc2 <- 1.1 * (max(abs(Tortho[, i])))

      lim <- c()
      if (max.pc1 > max.pc2) {
        lim <- c(-max.pc1, max.pc1)
      } else  {
        lim <- c(-max.pc2, max.pc2)
      }

      plot(Tp[,np], Tortho[, i], col = col, pch = 19, xlim = lim, ylim = lim,
          xlab = paste0("Predictive T score [", 1, "]"),
          ylab = paste0("Orthogonal T score [",i, "]"), main = "OPLS-DA score scatter plot")
      abline(h = 0, v = 0, lty = 2, col = "gray")
      legend(x = "topright", border = T, y.intersp = 0.7, cex = 1, pch = 19,
            legend = list("Group1", "Group0"), col = c(2, 3))
    }

    dev.off()


    # S-plot correlation and covariance matrices covariance
    s <- as.matrix(x, ncol = ncol(x))
    p1 <- c()
    for (i in 1:ncol(s))  {
      scov <- cov(s[, i], Tp[, np])
      p1 <- matrix(c(p1, scov), ncol = 1)  # covariance x-T
    }
    # correlation
    pcorr1 <- c()
    Tno <- as.matrix(Tp[, np], ncol = 1)
    for (i in 1:nrow(p1)) {
      den <- apply(Tno, 2, sd, na.rm = TRUE) * sd(s[, i])
      corr1 <- p1[i, ]/den
      pcorr1 <- matrix(c(pcorr1, corr1), ncol = 1)  # correlation
    }

    # plot
    pdf(file.path(out.path, "OPLS_Splot.pdf"), width = 10, height = 8)

    par(mfrow=c(1,1))
    plot(p1, pcorr1, xlab = "p(cov)[1]", ylab = "p(corr)[1]", main = "S-plot (OPLS-DA)",
        ylim = c(min(pcorr1, na.rm = T) * 1.1, max(pcorr1, na.rm = T) * 1.1),
        xlim = c(min(p1, na.rm = T) * 1.1, max(p1, na.rm = T) * 1.1))
    sel <- p1 * pcorr1
    sel <- order(sel, decreasing = TRUE)[1:nb]
    text(p1[sel], pcorr1[sel], labels = colnames(s)[sel], cex = 0.7, pos = 1)
    abline(v = 0, lty = 2)
    abline(h = 0, lty = 2)

    dev.off()


    ######################################## plot of the biomarker coefficients ##

    # OPLS pretreated PLS coefficients
    delta <- mean(sort(abs(b))[m - nb + c(0, 1)])

    pdf(file.path(out.path, "OPLS_coef.pdf"), width = 10, height = 6)

    par(mar = c(4, 2, 2, 1))

    plot(b, type = "l", xaxt = "n", yaxt = "n", main = "OPLS: Vector of descriptors' rank", xlab = "ppm")
    abline(h = 0)
    abline(h = delta * c(-1, 1), lty = 2)
    axis(side = 2, cex.axis = 0.7)
    axis(side = 1, at = c(1, seq(50, m, 50)), labels = xax[c(1, seq(50, m, 50))],
      cex.axis = 0.7)

    dev.off()



    # loadings plot
    pdf(file.path(out.path, "OPLS_LoadingPred.pdf"), width = 10, height = 6)

    plot(Pp[, np], xaxt = "n", type = "l", main = "OPLS pretreated PLS coefficients (loadings)")
    axis(side = 2, cex.axis = 0.7)
    axis(side = 1, at = c(1, seq(50, m, 50)), labels = xax[c(1, seq(50, m, 50))],
      cex.axis = 0.7)

    dev.off()

    graphics.off()
  }

  ########################### text impression
  if (impT == TRUE)
  { # cat('\n Dataset', dataname)
    print((ropls))
  }
  return(ropls)
}



# ===============================================================
# =============================================================== Fonction OPLSDA_pred
#' @export OPLSDA_pred
#' @title Prediction for OPLS-DA
#'
#' @description
#' Prediction of a new set of observations based on a built model.
#'
#' @param ropls Result from OPLS-DA analysis with the OPLSDA function.
#' @param x.new A vector or matrix of new observations.
#'
#' @return A vector with predicted y values.
#'
#' @examples
#' data('DataSimul')
#' x = DataSimul[['x']]
#' y = DataSimul[['y']]
#' oplsda.res = OPLSDA(x=x[-c(1:5),], y=y[-c(1:5)],
#'          impT = FALSE,impG = FALSE, no=2, nb = 15, out.path = '.')
#' OPLSDA_pred(ropls = oplsda.res, x.new = x[c(1:5),])


OPLSDA_pred <- function(ropls, x.new) {
  x.new <- x.new - colMeans(ropls$original.dataset)
  y.pred <- x.new %*% ropls$b
  y.pred <- y.pred + mean(ropls$original.response)
  names(y.pred) <- rownames(x.new)
  return(y.pred)
}


# ===============================================================

############################## Fonction cvOPLSDA
#' @export cvOPLSDA
#' @title K-fold cross-validation for OPLS-DA.
#'
#' @description
#' K-fold cross-validation for OPLS-DA based on the RMSE criterion and stratified according to y.
#'
#' @param x A data matrix on which will be based the analysis.
#' @param y A numerical vector representing the class of individuals.
#' @param k_fold The number of sub-datasets to create for cross-validation.
#' @param NumOrtho The maximum number of orthogonal components allowed in OPLSDA.
#' @param ImpG If \code{TRUE}, prints a validation plot.
#'
#' @return A list with the following elements:
#' \describe{
#'   \item{\code{RMSECV}}{RMSECV from the OPLSDA cross-validation.}
#'   \item{\code{folds_i}}{A vector indicating the group of observations for cross-validation.}
#'   \item{\code{Prop1}}{Table of proportions of 1's in y for each subgroup.}
#'   \item{\code{yres}}{List with true and predicted classes for each \code{NumOrtho}.}
#' }
#'
#' @examples
#' data('DataSimul')
#' x = DataSimul[['x']]
#' y = DataSimul[['y']]
#' cvOPLSDA.res = cvOPLSDA(x = x, y = y, k_fold = 10, NumOrtho = 5)
#' plot(cvOPLSDA.res$RMSECV)
#'
#' @importFrom caret createFolds

cvOPLSDA <- function(x, y, k_fold = 10, NumOrtho = 1, ImpG = FALSE) {

  # checks if (sum(!y %in% c(0,1)) >0) { warning('y is not a vector of 1 and 0s') }

  checkArg(k_fold, "int", can.be.null = FALSE)
  checkArg(NumOrtho, "int", can.be.null = FALSE)

  #library("plyr")

  # df <- data.frame(rowname = rownames(x), Class = y)
  # n <- dim(x)[1]

  # createFolds <- function(x, k) {
  #   n <- nrow(x)
  #   x$FOLDS <- rep(1:k, length.out = n)[sample(n, n)]
  #   x
  # }
  #
  #
  # folds <- plyr::ddply(.data = df, .variables = .(df$Class), .fun = plyr::here(createFolds), k = k_fold)
  # folds_i <- folds$FOLDS
  #
  # # Prop1 = plyr::ddply(.data = folds,.variables = .(FOLDS) ,.fun =
  # # plyr::here(plyr::summarise), prop = sum(.(Class))/length(.(Class)))
  #
  # Prop1 <- stats::aggregate(folds$Class, by = list(Category = folds$FOLDS), FUN = mean)

  df <- data.frame(rowname = rownames(x), Class = y)
  n <- dim(x)[1]

  # indication of in which fold are the samples
  folds <- caret::createFolds(df$Class, k = k_fold, list = FALSE)

  df$fold <- folds
  # mean value of the response per fold
  # Prop1 <- plyr::ddply(.data = df, .variables = "fold", .fun = summarise, prop = mean(Class))
  Prop1 <- as.matrix(table(df$fold, df$Class)/as.vector(table(df$fold)))[,1, drop=FALSE]
  #######

  index <- vector("list", k_fold)
  Xtrain <- vector("list", k_fold)
  Ytrain <- vector("list", k_fold)
  Xnew <- vector("list", k_fold)
  for (k in 1:k_fold) {
    index[[k]] <- which(folds == k)
    Xtrain[[k]] <- x[-index[[k]], ]
    Ytrain[[k]] <- y[-index[[k]]]
    Xnew[[k]] <- x[index[[k]], ]
  }

  RMSECV <- c()
  yres <- vector("list")
  for (j in 1:NumOrtho) {

    ropls.train <- mapply(OPLSDA, x = Xtrain, y = Ytrain, MoreArgs = list(impT = FALSE,
                          impG = FALSE, no = j, out.path = "."), SIMPLIFY = FALSE, USE.NAMES = TRUE)

    ypred <- mapply(OPLSDA_pred, ropls = ropls.train, x.new = Xnew, SIMPLIFY = FALSE,
                    USE.NAMES = TRUE)

    Ypred <- unlist(ypred)
    sorted <- order(names(Ypred))
    Ypred <- Ypred[order(names(Ypred))]

    ytrue <- df$Class
    names(ytrue) <- df$rowname
    ytrue <- ytrue[order(names(ytrue))]

    yres[[j]] <- cbind(ytrue = ytrue, Ypred = Ypred)

    RMSECV[(j)] <- sqrt(sum((ytrue - Ypred)^2)/n)

  }

  rcvOPLSDA <- list(RMSECV = RMSECV, folds_i = folds, Prop1 = Prop1, yres = yres)


  if (ImpG == TRUE) {
    plot(RMSECV, xaxt = "n", ylab = "RMSECV", main = "Validation plot", type = "b",
         xlab = "Number of orthogonal components")
    axis(side = 1, at = c(1:NumOrtho), labels = c(1:NumOrtho))
  }
  return(rcvOPLSDA)

}