MApckg: R package of MetaboAnalyst functions

#' PLSDA cross validation
#'
#' Perform PLSDA classification and feature selection. 
#' Writes two output files: \code{"plsda_vip.csv"} and \code{"plsda_coef.csv"}. 
#'
#' @param dataSet List, data set object generated by \code{\link[MSdata]{MS_to_MA}} function.
#' @param analSet List, containing the results of statistical analysis (can be just an empty list).
#' @param methodName Cross validation method \code{"LOOCV"} or \code{"CV"}
#' @param compNum The maximum number of components to search.
#' @param choice The determining performance measure, one of \code{"Q2"}, \code{"R2"}, \code{"Accuracy"}
#' @return Native \code{analSet} with one added \code{$plsda} element consisting of:
#' \itemize{
#' \item\code{$best.num} - the detected best number of components
#' \item\code{$choice} - value of \code{choice} argument
#' \item\code{$coef.mat} - matrix of VIPs (variable importance in projection)
#' \item\code{$vip.mat} - matrix of weighted sums of absolute regression coefficients
#' \item\code{$fit.info} - all performance measures calculated for different number of components
#' }
#' @seealso \code{\link{PLS.Anal}}, \code{\link{PLS.Loadings}}, \code{\link{PLSDA.Permut}}, \code{\link{PlotPLS}}
#' @export

# classification and feature selection
PLSDA.CV<-function(dataSet, analSet, methodName="CV", compNum=GetDefaultPLSCVComp(dataSet), choice="Q2"){
	if (is.null(analSet$plsr)) stop("Please, conduct PLS.Anal and PLS.Loadings first.")
    match.arg(methodName, c("LOOCV", "CV"))
	match.arg(choice, c("Q2", "R2", "Accuracy"))
	# get classification accuracy using caret

    cls<-as.numeric(dataSet$cls)-1;
    datmat<-as.matrix(dataSet$norm);

    plsda.cls <- caret::train(dataSet$norm, dataSet$cls, "pls", trControl=caret::trainControl(method=ifelse(methodName == 'LOOCV', "LOOCV", 'CV')), tuneLength=compNum);

    # use the classifical regression to get R2 and Q2 measure
    plsda.reg <- pls::plsr(cls~datmat,method ='oscorespls', ncomp=compNum, validation= ifelse(methodName == 'LOOCV', "LOO", 'CV'));
    fit.info <- pls::R2(plsda.reg, estimate = "all")$val[,1,];

    # combine accuracy, R2 and Q2
    accu <- plsda.cls$results[,2]
    all.info <- rbind(accu, fit.info[,-1]);
    rownames(all.info) <- c("Accuracy", "R2", "Q2");

    # default use best number determined by Q2
    if(choice == 'Q2'){
        best.num <- which(all.info[3,] == max(all.info[3,]));
    }else if(choice == "R2"){
        best.num <- which(all.info[2,] == max(all.info[2,]));
    }else{
        best.num <- which(all.info[1,] == max(all.info[1,]));
    }

    # get coef. table, this can be error when class is very unbalanced  
    coef.mat <- try(caret::varImp(plsda.cls, scale=T)$importance);
    if(class(coef.mat) == "try-error") {
        coef.mat <- NULL;
    }else{
        if(dataSet$cls.num > 2){ # add an average coef for multiple class
            coef.mean<-apply(coef.mat, 1, mean);
            coef.mat <- cbind(coef.mean = coef.mean, coef.mat);
        }
        # rearange in decreasing order, keep as matrix, prevent dimesion dropping if only 1 col
        inx.ord<- order(coef.mat[,1], decreasing=T);
        coef.mat <- data.matrix(coef.mat[inx.ord, ,drop=FALSE]);
        write.csv(signif(coef.mat,5), file="plsda_coef.csv"); # added 27 Jan 2014
    }
    # calculate VIP http://mevik.net/work/software/VIP.R
    pls<-analSet$plsr;
    b <- c(pls$Yloadings)[1:compNum];
    T <- pls$scores[,1:compNum, drop = FALSE]
    SS <- b^2 * colSums(T^2)
    W <- pls$loading.weights[,1:compNum, drop = FALSE]
    Wnorm2 <- colSums(W^2);
    SSW <- sweep(W^2, 2, SS / Wnorm2, "*")
    vips <- sqrt(nrow(SSW) * apply(SSW, 1, cumsum) / cumsum(SS));
    if(compNum > 1){
        vip.mat <- as.matrix(t(vips));
    }else{
        vip.mat <- as.matrix(vips);
    }
    colnames(vip.mat) <- paste("Comp.", 1:ncol(vip.mat));
    write.csv(signif(vip.mat,5),file="plsda_vip.csv");

    analSet$plsda<-list(best.num=best.num, choice=choice, coef.mat=coef.mat, vip.mat=vip.mat, fit.info=all.info);
    return(analSet);
}

#' \code{PlotPLS.Classification} - plot PLSDA classification performance using different components
#' @rdname PlotPLS
#' @export

# Plot plsda classification performance using different components
PlotPLS.Classification<-function(dataSet, analSet, imgName="plsda_", format="png", dpi=72, width=NA){
    if (is.null(analSet$plsr)) stop("Please, conduct PLS.Anal and PLSDA.CV first.")
	if (is.null(analSet$plsda)) stop("Please, conduct PLSDA.CV first.")
	
	res<-analSet$plsda$fit.info;
    colnames(res) <- 1:ncol(res);
    best.num <- analSet$plsda$best.num;
    choice <- analSet$plsda$choice;
    imgName = paste(imgName, "dpi", dpi, ".", format, sep="");
    if(is.na(width)){
        w <- 7;
    }else if(width == 0){
        w <- 7;
        analSet$imgSet$pls.class<-imgName;
    }else{
        w <- width; 
    }
    h <- w*5/7;

    Cairo::Cairo(file = imgName, unit="in", dpi=dpi, width=w, height=h, type=format, bg="white");
    par(mar=c(5,5,2,7)); # put legend on the right outside
    barplot(res, beside = TRUE, col = c("lightblue", "mistyrose","lightcyan"), ylim= c(0,1.05), xlab="Number of components", ylab="Performance");

    if(choice == "Q2"){
        text((best.num-1)*3 + best.num + 2.5, res[3,best.num]+ 0.02, labels = "*", cex=2.5, col="red");
    }else if(choice == "R2"){
        text((best.num-1)*3 + best.num + 1.5, res[2,best.num]+ 0.02, labels = "*", cex=2.5, col="red");
    }else{
        text((best.num-1)*3 + best.num + 0.5, res[1,best.num]+ 0.02, labels = "*", cex=2.5, col="red");
    }

    # calculate the maximum y position, each bar is 1, place one space between the group
    xpos <- ncol(res)*3 + ncol(res) + 1;
    legend(xpos, 1.0, rownames(res), fill = c("lightblue", "mistyrose","lightcyan"), xpd=T);
    dev.off();
	frame()
	grid::grid.raster(png::readPNG(imgName));
}