R/crossValidFeatureNB.R

featureLog    <- c(0, 1, 1, 1, 0, 1, 1)
featureZscore <- c(1, 1, 1, 1, 0, 0, 1)
tolerance <- 0.7
lipidType <- "SPH"
indexFeature <- c(56, 38, 43, 64, 71, 82, 83)
nameFeature  <- c("RT", "AR", "HR", "FWHM", "rRT", "TF", "AF")
hist_scale <- "log10"
hist_binWidth <- 1

cvFolds <- 10
seed <- 1
k <- 10
#nameLibrary <- "/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/testData/testthat_sampleSphingolipidsv109.xlsx"
#nameQsession <- "/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/testData/listQsession_SPH.txt"
nameLibrary <- "/home/jchitpin/Downloads/Masters/Projects/MATRIX/files/Sphingolipidsv109.xlsx"
nameQsession <- "/home/jchitpin/Downloads/Masters/Projects/MATRIX/files/Qsession/cvfNB_datasets/SPH_Mouse_Brain.txt"

standard <- matrix(c("C16-D31 Ceramide", 264.3,
                     "SM(d18:1/18:1-d9)", 184.1),
                   ncol = 2, byrow = T)
setwd("/home/jchitpin/Downloads/Masters/Projects/MATRIX/Progress_Meetings/28_20_May_2019/")
library(data.table)
library(openxlsx)
library(ggplot2)
library(matrixStats)
library(caret)
library(igraph)
library(plyr)
library(datastructures)
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/featureLibrary.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/featurePrior.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/foldLibrary.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/methodMassInfo.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/percentNormality.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/splitFolds.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/featureTransform.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/meltDT.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/featureMeanVar.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/combinations.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/NB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/topcvfNB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/loadTopcvfNB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/dupMAP.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/nodupMAP.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/confusioncvfNB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/isobarcvfNB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/ggplot2_histPosteriorcvfNB.R")

crossValidFeatureNB <- function(featureZscore, featureLog, tolerance, lipidType,
                                indexFeature, cvFolds, seed, nameLibrary,
                                nameQsession, standard){

    ## Initialize outputs
    percentNorm <- matrix(0, nrow = 4, ncol = length(indexFeature))
    percentNorm <- lapply(seq_len(cvFolds), function(x) percentNorm)
    DT_all <- vector("list", length=length(indexFeature))
    DT_folds <- vector("list", length=length(indexFeature))
    DT_training <- vector("list", length=length(indexFeature))
    DT_tuning <- vector("list", length=length(indexFeature))
    DT_testing <- vector("list", length=length(indexFeature))
    DT_training_MV <- vector("list", length=length(indexFeature))
    DT_Full <- vector("list", length=cvFolds)

    ## Determining transition lists from MRM methods in library
    massMethod <- methodMassInfo(nameLibrary = nameLibrary)

    ## Progress message
    message(paste("Generating feature libraries at", Sys.time(), sep=" "))

    ## Compute feature matrix
    for(i in seq_along(indexFeature)){
        indexFeature_i <- indexFeature[i]

        ## Create feature matrix
        DT <- featureLibrary(lipidType = lipidType,
                             nameQsession = nameQsession,
                             nameLibrary = nameLibrary,
                             indexFeature = indexFeature_i,
                             standard = standard)
        DT_all[[i]] <- DT

        ## Print message
        message(paste0("Completed ", i, "/", length(indexFeature), " at ", Sys.time()))
    }
    rm(i, indexFeature_i, DT)

    ## Progress message
    message("Splitting feature matrices into folds")

    ## Split feature matrices into folds
    for(i in seq_along(DT_all)){
        DT_folds[[i]] <- foldLibrary(DT_featureLibrary = DT_all[[i]],
                            cvFolds = cvFolds,
                            seed = seed)
    }
    rm(i, DT_all)

    ## 'folds' are the folds within each feature
    for(folds in seq_along(DT_folds[[1]])){

        ## Progress message
        message(paste0("Fold ", folds, ":"))
        message("Populating testing, tuning, training matrices")

        ## 'feat' are the features
        for(feat in seq_along(DT_folds)){

            ## Split folds into testing, tuning, training sets
            combined <- splitFolds(indexFeature = indexFeature,
                                   DT_folds = DT_folds,
                                   feat = feat,
                                   folds = folds)
            DT_testing[[feat]]  <- combined$testing
            DT_tuning[[feat]]   <- combined$tuning
            DT_training[[feat]] <- combined$training
        }
        rm(feat, combined)

        ## Estimate normality of lipids in the training data
        for(i in seq_along(DT_training)){

            ## Progress message
            message(paste0("Estimating normality of lipids in training data fold ", folds, " for feature ", i, "/", length(DT_training)))

            ## Estimating normality
            DT <- DT_training[[i]]
            p_KS <- 1/10^ceiling(log10(nrow(DT)))
            percentNorm[[folds]][,i] <- percentNormality(DT = DT, p_KS = p_KS)
        }
        colnames(percentNorm[[folds]]) <- nameFeature
        rownames(percentNorm[[folds]]) <- c("Raw", "Log10", "Z-score", "Z-score(log10)")
        #save(percentNorm, file="percentNormality.rda")
        rm(DT)

        ## Progress message
        message("Applying transformations, if applicable.")

        ## Log10 or Z-score some features if transformations specified
        featureLog    <- as.logical(featureLog)
        featureZscore <- as.logical(featureZscore)

        ## Only log or Z-score the current [[folds]]
        ## Create a new copy of the sets for transformations because of data
        ## table assign by reference to prevent propagating transformations on
        ## transformations.
        ## DT_training/testing/tuning are the untransformed originals
        DT_trainingTransform <- copy(DT_training)
        DT_testingTransform  <- copy(DT_testing)
        DT_tuningTransform   <- copy(DT_tuning)

        for(i in seq_along(DT_training)){
            DT_trainingTransform[[i]] <- featureTransform(DT = DT_trainingTransform[[i]],
                                                          logBool = featureLog[i],
                                                          zBool   = featureZscore[i])
            DT_testingTransform[[i]]  <- featureTransform(DT = DT_testingTransform[[i]],
                                                          logBool = featureLog[i],
                                                          zBool   = featureZscore[i])
            DT_tuningTransform[[i]]   <- featureTransform(DT = DT_tuningTransform[[i]],
                                                          logBool = featureLog[i],
                                                          zBool   = featureZscore[i])
        }

        ## Progress message
        message(paste0("Estimating likelihood parameters for fold ", folds, "."))

        ## Calculate mean and variance of training data for likelihood estimation
        for(i in seq_along(DT_trainingTransform)){
            DT_training_MV[[i]] <- featureMeanVar(DT_featureLibrary = DT_trainingTransform[[i]])
        }
        names(DT_training_MV) <- nameFeature

        ## Progress message
        message(paste0("Estimating priors for fold ", folds, "."))

        ## Estimate priors from training data
        DT_prior <- featurePrior(DT_featureLibrary = DT_training[[1]], DT_methodMassInfo = massMethod)
        selectCol <- c("library.Q1", "library.Q3", "Barcode", "matrix", "prior")
        DT_prior <- DT_prior[, selectCol, with=F]
        DT_prior <- DT_prior[!duplicated(DT_prior[,c("Barcode", "matrix")]),]
        rm(selectCol)

        ## Melt tuning data
        for(i in seq_along(DT_tuningTransform)){
            DT_tuningTransform[[i]] <- meltDT(DT_tuningTransform[[i]])
        }
        names(DT_tuningTransform) <- nameFeature

        ## Melt testing data
        for(i in seq_along(DT_testingTransform)){
            DT_testingTransform[[i]] <- meltDT(DT_testingTransform[[i]])
        }
        names(DT_testingTransform) <- nameFeature
        rm(i)

        ## Progress message
        message("Generating feature combinations.")

        ## Matrix of feature combinations
        listFeatureChoices <- combinations(chooseK = length(indexFeature),
                                           nameFeature = nameFeature)

        ## Collection vector of incorrect assignments for tuning and testing set
        if(folds == 1){
            mismatchTuning <- vector(mode = "numeric",
                                            length=nrow(listFeatureChoices))
            mismatchTuning <- lapply(seq_len(cvFolds),
                                            function(x) mismatchTuning)
            mismatchTesting <- vector(mode = "numeric",
                                             length=nrow(listFeatureChoices))
            mismatchTesting <- lapply(seq_len(cvFolds),
                                             function(x) mismatchTesting)
        }

        ## Progress message
        message(paste0("Estimating posterior probabilities based on (transformed) feature combinations for fold ", folds, "."))

        ## Naive Bayes
        # output[[1]] returns all candidate assignments and posteriors
        # output[[2]] rows are best posteriors according to MWBM
        # output[[3]] rows are incorrect assignments
        for(i in 1:nrow(listFeatureChoices)){
            featureChoice <- nameFeature[as.logical(listFeatureChoices[i,])]

            ## Naive Bayes for tuning set
            nbTuning <- NB(DT = DT_tuningTransform,
                           Prior = DT_prior,
                           Likelihood_MV = DT_training_MV,
                           tolerance = tolerance,
                           Training = featureChoice)

            mismatchTuning[[folds]][i] <- nrow(nbTuning$incorrect)

            ## Naive Bayes for testing set
            nbTesting <- NB(DT = DT_testingTransform,
                            Prior = DT_prior,
                            Likelihood_MV = DT_training_MV,
                            tolerance = tolerance,
                            Training = featureChoice)

            mismatchTesting[[folds]][i] <- nrow(nbTesting$incorrect)

            ## Collect MWBM assignments
            DT_Full[[folds]][[i]]       <- nbTesting[[1]]
        }
        rm(DT_trainingTransform, DT_testingTransform, DT_tuningTransform, i)

        save(DT_Full, file = paste("Output_CV_", folds, ".rda", sep=""))
        DT_Full[[folds]] <- NA

        ## Progress message
        message(paste("Completed cross-validation fold", folds, "at", Sys.time(), sep=" "))
    }
    rm(DT_training_MV, DT_folds, DT_testing, DT_training, DT_tuning, p_KS)

    ## Creating feature combination matrix of false positives
    featureCombinationFP <- cbind(listFeatureChoices,
                                  Reduce("+", mismatchTuning),
                                  Reduce("+", mismatchTesting))

    ## Loading Naive Bayes output and subsetting feature combinations to top k
    nbOutput <- loadTopcvfNB(k = k,
                           listFeatureChoices = listFeatureChoices,
                           mismatchTuning = mismatchTuning,
                           mismatchTesting = mismatchTesting,
                           cvFolds = cvFolds)

    ## Calculating maximum a posteriori (MAP) (with/without duplicate barcode) assignments
    ## for every fold of the top k feature combinations (chosen by MWBM)
    nbOutput <- dupMAP(nbOutput = nbOutput)
    nbOutput <- nodupMAP(nbOutput = nbOutput)

    ## Generate confusion matrices for MAP (with/without duplicate barcode) assignments and
    ## MWBM across folds for all top k feature combinations
    dupMat <- confusioncvfNB(nbOutput = nbOutput,
                             nameColTF = "dupMAP")
    nodupMat  <- confusioncvfNB(nbOutput = nbOutput,
                                nameColTF = "nodupMAP")
    mwbmMat <- confusioncvfNB(nbOutput = nbOutput,
                              nameColTF = "MWBM")

    ## Create a table of isobaric transitions across all testing data folds
    isobarMat <- isobarcvfNB(nbOutput = nbOutput)

    ## Finding top k combinations and replace 1/0 matrix with column names
    interestingCombos <- topcvfNB(k = k,
                                  listFeatureChoices = listFeatureChoices,
                                  mismatchTuning = mismatchTuning,
                                  mismatchTesting = mismatchTesting)

    ## Replace 1/0 with appropriate feature name
    for(i in seq_along(interestingCombos)){
        ones <- which(listFeatureChoices[interestingCombos[i], ] == 1, arr.ind=T)
        listFeatureChoices[interestingCombos[i],][ones] <- colnames(listFeatureChoices)[ones]
        zeros <- which(listFeatureChoices[interestingCombos[i], ] == "0", arr.ind=T)
        listFeatureChoices[interestingCombos[i],][zeros] <- ""
    }

    ## Generate plots for each fold
    dir.create(file.path("posterior_histograms"), showWarnings = FALSE)
    dir <- getwd()
    for(i in 1:cvFolds){
        setwd(dir)
        dir.create(file.path(paste0(dir, "/posterior_histograms/fold_", i)), showWarnings = FALSE)
        setwd(paste0(dir, "/posterior_histograms/fold_", i))
        for(j in 1:k){
            ## Plot posterior probability histograms of the top k feature combinations for nodupMAP and MWBM
            g <- histPosteriorcvfNB(DT = nbOutput[[i]][[j]],
                               matNameFeature = listFeatureChoices[interestingCombos[j], ],
                               scaling = "log10",
                               binWidth = 1)
            ggsave(g, filename = paste0("Histogram_featureCombination_", j), device = "pdf", width = 8, height = 6)
        }
    }
    setwd(dir)


    #dir.create(file.path("QC_plots"), showWarnings = FALSE)
    #for(i in 1:cvFolds){
    #    setwd(dir)
    #    dir.create(file.path(paste0(dir, "/QC_plots/fold_", i)), showWarnings = FALSE)
    #    setwd(paste0(dir, "/QC_plots/fold_", i))
    #    for(j in 1:k){
    #        for(l in 1:nbOutput[[i]][[j]][, unique(run)]){
    #            g <- histPosteriorcvfNB(DT = nbOutput[[i]][[j]],
    #                                    matNameFeature = listFeatureChoices[interestingCombos[j], ],
    #                                    scaling = "log10",
    #                                    binWidth = 1)
    #            ggsave(g, filename = paste0("QC_featureCombination_", j), device = "pdf", width = 8, height = 6)
    #        }
    #    }
    #}
    #setwd(dir)

    ## Save all outputs into a combined list
    summary <- list(nbOutput=nbOutput,
                    confusionDupMat=dupMat,
                    confusionNodupMat=nodupMat,
                    confusionMwbmMat=mwbmMat,
                    isobarMat=isobarMat,
                    featureCombinationFP=featureCombinationFP,
                    percentNorm=percentNorm,
                    topKCombinations=listFeatureChoices[interestingCombos, ])
    save(summary, file = "cvfNB_summary.rda")

    return(summary)

}
# Function ends
jchitpin/blistR documentation built on July 8, 2019, 6:29 p.m.