featureLog <- c(0, 1, 1, 1, 0, 1, 1)
featureZscore <- c(1, 1, 1, 1, 0, 0, 1)
tolerance <- 0.7
lipidType <- "SPH"
indexFeature <- c(56, 38, 43, 64, 71, 82, 83)
nameFeature <- c("RT", "AR", "HR", "FWHM", "rRT", "TF", "AF")
hist_scale <- "log10"
hist_binWidth <- 1
cvFolds <- 10
seed <- 1
k <- 10
#nameLibrary <- "/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/testData/testthat_sampleSphingolipidsv109.xlsx"
#nameQsession <- "/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/testData/listQsession_SPH.txt"
nameLibrary <- "/home/jchitpin/Downloads/Masters/Projects/MATRIX/files/Sphingolipidsv109.xlsx"
nameQsession <- "/home/jchitpin/Downloads/Masters/Projects/MATRIX/files/Qsession/cvfNB_datasets/SPH_Mouse_Brain.txt"
standard <- matrix(c("C16-D31 Ceramide", 264.3,
"SM(d18:1/18:1-d9)", 184.1),
ncol = 2, byrow = T)
setwd("/home/jchitpin/Downloads/Masters/Projects/MATRIX/Progress_Meetings/28_20_May_2019/")
library(data.table)
library(openxlsx)
library(ggplot2)
library(matrixStats)
library(caret)
library(igraph)
library(plyr)
library(datastructures)
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/featureLibrary.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/featurePrior.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/foldLibrary.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/methodMassInfo.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/percentNormality.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/splitFolds.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/featureTransform.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/meltDT.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/featureMeanVar.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/combinations.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/NB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/topcvfNB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/loadTopcvfNB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/dupMAP.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/nodupMAP.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/confusioncvfNB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/isobarcvfNB.R")
source("/home/jchitpin/Downloads/Masters/Projects/MATRIX/RPackages/blistR/R/ggplot2_histPosteriorcvfNB.R")
crossValidFeatureNB <- function(featureZscore, featureLog, tolerance, lipidType,
indexFeature, cvFolds, seed, nameLibrary,
nameQsession, standard){
## Initialize outputs
percentNorm <- matrix(0, nrow = 4, ncol = length(indexFeature))
percentNorm <- lapply(seq_len(cvFolds), function(x) percentNorm)
DT_all <- vector("list", length=length(indexFeature))
DT_folds <- vector("list", length=length(indexFeature))
DT_training <- vector("list", length=length(indexFeature))
DT_tuning <- vector("list", length=length(indexFeature))
DT_testing <- vector("list", length=length(indexFeature))
DT_training_MV <- vector("list", length=length(indexFeature))
DT_Full <- vector("list", length=cvFolds)
## Determining transition lists from MRM methods in library
massMethod <- methodMassInfo(nameLibrary = nameLibrary)
## Progress message
message(paste("Generating feature libraries at", Sys.time(), sep=" "))
## Compute feature matrix
for(i in seq_along(indexFeature)){
indexFeature_i <- indexFeature[i]
## Create feature matrix
DT <- featureLibrary(lipidType = lipidType,
nameQsession = nameQsession,
nameLibrary = nameLibrary,
indexFeature = indexFeature_i,
standard = standard)
DT_all[[i]] <- DT
## Print message
message(paste0("Completed ", i, "/", length(indexFeature), " at ", Sys.time()))
}
rm(i, indexFeature_i, DT)
## Progress message
message("Splitting feature matrices into folds")
## Split feature matrices into folds
for(i in seq_along(DT_all)){
DT_folds[[i]] <- foldLibrary(DT_featureLibrary = DT_all[[i]],
cvFolds = cvFolds,
seed = seed)
}
rm(i, DT_all)
## 'folds' are the folds within each feature
for(folds in seq_along(DT_folds[[1]])){
## Progress message
message(paste0("Fold ", folds, ":"))
message("Populating testing, tuning, training matrices")
## 'feat' are the features
for(feat in seq_along(DT_folds)){
## Split folds into testing, tuning, training sets
combined <- splitFolds(indexFeature = indexFeature,
DT_folds = DT_folds,
feat = feat,
folds = folds)
DT_testing[[feat]] <- combined$testing
DT_tuning[[feat]] <- combined$tuning
DT_training[[feat]] <- combined$training
}
rm(feat, combined)
## Estimate normality of lipids in the training data
for(i in seq_along(DT_training)){
## Progress message
message(paste0("Estimating normality of lipids in training data fold ", folds, " for feature ", i, "/", length(DT_training)))
## Estimating normality
DT <- DT_training[[i]]
p_KS <- 1/10^ceiling(log10(nrow(DT)))
percentNorm[[folds]][,i] <- percentNormality(DT = DT, p_KS = p_KS)
}
colnames(percentNorm[[folds]]) <- nameFeature
rownames(percentNorm[[folds]]) <- c("Raw", "Log10", "Z-score", "Z-score(log10)")
#save(percentNorm, file="percentNormality.rda")
rm(DT)
## Progress message
message("Applying transformations, if applicable.")
## Log10 or Z-score some features if transformations specified
featureLog <- as.logical(featureLog)
featureZscore <- as.logical(featureZscore)
## Only log or Z-score the current [[folds]]
## Create a new copy of the sets for transformations because of data
## table assign by reference to prevent propagating transformations on
## transformations.
## DT_training/testing/tuning are the untransformed originals
DT_trainingTransform <- copy(DT_training)
DT_testingTransform <- copy(DT_testing)
DT_tuningTransform <- copy(DT_tuning)
for(i in seq_along(DT_training)){
DT_trainingTransform[[i]] <- featureTransform(DT = DT_trainingTransform[[i]],
logBool = featureLog[i],
zBool = featureZscore[i])
DT_testingTransform[[i]] <- featureTransform(DT = DT_testingTransform[[i]],
logBool = featureLog[i],
zBool = featureZscore[i])
DT_tuningTransform[[i]] <- featureTransform(DT = DT_tuningTransform[[i]],
logBool = featureLog[i],
zBool = featureZscore[i])
}
## Progress message
message(paste0("Estimating likelihood parameters for fold ", folds, "."))
## Calculate mean and variance of training data for likelihood estimation
for(i in seq_along(DT_trainingTransform)){
DT_training_MV[[i]] <- featureMeanVar(DT_featureLibrary = DT_trainingTransform[[i]])
}
names(DT_training_MV) <- nameFeature
## Progress message
message(paste0("Estimating priors for fold ", folds, "."))
## Estimate priors from training data
DT_prior <- featurePrior(DT_featureLibrary = DT_training[[1]], DT_methodMassInfo = massMethod)
selectCol <- c("library.Q1", "library.Q3", "Barcode", "matrix", "prior")
DT_prior <- DT_prior[, selectCol, with=F]
DT_prior <- DT_prior[!duplicated(DT_prior[,c("Barcode", "matrix")]),]
rm(selectCol)
## Melt tuning data
for(i in seq_along(DT_tuningTransform)){
DT_tuningTransform[[i]] <- meltDT(DT_tuningTransform[[i]])
}
names(DT_tuningTransform) <- nameFeature
## Melt testing data
for(i in seq_along(DT_testingTransform)){
DT_testingTransform[[i]] <- meltDT(DT_testingTransform[[i]])
}
names(DT_testingTransform) <- nameFeature
rm(i)
## Progress message
message("Generating feature combinations.")
## Matrix of feature combinations
listFeatureChoices <- combinations(chooseK = length(indexFeature),
nameFeature = nameFeature)
## Collection vector of incorrect assignments for tuning and testing set
if(folds == 1){
mismatchTuning <- vector(mode = "numeric",
length=nrow(listFeatureChoices))
mismatchTuning <- lapply(seq_len(cvFolds),
function(x) mismatchTuning)
mismatchTesting <- vector(mode = "numeric",
length=nrow(listFeatureChoices))
mismatchTesting <- lapply(seq_len(cvFolds),
function(x) mismatchTesting)
}
## Progress message
message(paste0("Estimating posterior probabilities based on (transformed) feature combinations for fold ", folds, "."))
## Naive Bayes
# output[[1]] returns all candidate assignments and posteriors
# output[[2]] rows are best posteriors according to MWBM
# output[[3]] rows are incorrect assignments
for(i in 1:nrow(listFeatureChoices)){
featureChoice <- nameFeature[as.logical(listFeatureChoices[i,])]
## Naive Bayes for tuning set
nbTuning <- NB(DT = DT_tuningTransform,
Prior = DT_prior,
Likelihood_MV = DT_training_MV,
tolerance = tolerance,
Training = featureChoice)
mismatchTuning[[folds]][i] <- nrow(nbTuning$incorrect)
## Naive Bayes for testing set
nbTesting <- NB(DT = DT_testingTransform,
Prior = DT_prior,
Likelihood_MV = DT_training_MV,
tolerance = tolerance,
Training = featureChoice)
mismatchTesting[[folds]][i] <- nrow(nbTesting$incorrect)
## Collect MWBM assignments
DT_Full[[folds]][[i]] <- nbTesting[[1]]
}
rm(DT_trainingTransform, DT_testingTransform, DT_tuningTransform, i)
save(DT_Full, file = paste("Output_CV_", folds, ".rda", sep=""))
DT_Full[[folds]] <- NA
## Progress message
message(paste("Completed cross-validation fold", folds, "at", Sys.time(), sep=" "))
}
rm(DT_training_MV, DT_folds, DT_testing, DT_training, DT_tuning, p_KS)
## Creating feature combination matrix of false positives
featureCombinationFP <- cbind(listFeatureChoices,
Reduce("+", mismatchTuning),
Reduce("+", mismatchTesting))
## Loading Naive Bayes output and subsetting feature combinations to top k
nbOutput <- loadTopcvfNB(k = k,
listFeatureChoices = listFeatureChoices,
mismatchTuning = mismatchTuning,
mismatchTesting = mismatchTesting,
cvFolds = cvFolds)
## Calculating maximum a posteriori (MAP) (with/without duplicate barcode) assignments
## for every fold of the top k feature combinations (chosen by MWBM)
nbOutput <- dupMAP(nbOutput = nbOutput)
nbOutput <- nodupMAP(nbOutput = nbOutput)
## Generate confusion matrices for MAP (with/without duplicate barcode) assignments and
## MWBM across folds for all top k feature combinations
dupMat <- confusioncvfNB(nbOutput = nbOutput,
nameColTF = "dupMAP")
nodupMat <- confusioncvfNB(nbOutput = nbOutput,
nameColTF = "nodupMAP")
mwbmMat <- confusioncvfNB(nbOutput = nbOutput,
nameColTF = "MWBM")
## Create a table of isobaric transitions across all testing data folds
isobarMat <- isobarcvfNB(nbOutput = nbOutput)
## Finding top k combinations and replace 1/0 matrix with column names
interestingCombos <- topcvfNB(k = k,
listFeatureChoices = listFeatureChoices,
mismatchTuning = mismatchTuning,
mismatchTesting = mismatchTesting)
## Replace 1/0 with appropriate feature name
for(i in seq_along(interestingCombos)){
ones <- which(listFeatureChoices[interestingCombos[i], ] == 1, arr.ind=T)
listFeatureChoices[interestingCombos[i],][ones] <- colnames(listFeatureChoices)[ones]
zeros <- which(listFeatureChoices[interestingCombos[i], ] == "0", arr.ind=T)
listFeatureChoices[interestingCombos[i],][zeros] <- ""
}
## Generate plots for each fold
dir.create(file.path("posterior_histograms"), showWarnings = FALSE)
dir <- getwd()
for(i in 1:cvFolds){
setwd(dir)
dir.create(file.path(paste0(dir, "/posterior_histograms/fold_", i)), showWarnings = FALSE)
setwd(paste0(dir, "/posterior_histograms/fold_", i))
for(j in 1:k){
## Plot posterior probability histograms of the top k feature combinations for nodupMAP and MWBM
g <- histPosteriorcvfNB(DT = nbOutput[[i]][[j]],
matNameFeature = listFeatureChoices[interestingCombos[j], ],
scaling = "log10",
binWidth = 1)
ggsave(g, filename = paste0("Histogram_featureCombination_", j), device = "pdf", width = 8, height = 6)
}
}
setwd(dir)
#dir.create(file.path("QC_plots"), showWarnings = FALSE)
#for(i in 1:cvFolds){
# setwd(dir)
# dir.create(file.path(paste0(dir, "/QC_plots/fold_", i)), showWarnings = FALSE)
# setwd(paste0(dir, "/QC_plots/fold_", i))
# for(j in 1:k){
# for(l in 1:nbOutput[[i]][[j]][, unique(run)]){
# g <- histPosteriorcvfNB(DT = nbOutput[[i]][[j]],
# matNameFeature = listFeatureChoices[interestingCombos[j], ],
# scaling = "log10",
# binWidth = 1)
# ggsave(g, filename = paste0("QC_featureCombination_", j), device = "pdf", width = 8, height = 6)
# }
# }
#}
#setwd(dir)
## Save all outputs into a combined list
summary <- list(nbOutput=nbOutput,
confusionDupMat=dupMat,
confusionNodupMat=nodupMat,
confusionMwbmMat=mwbmMat,
isobarMat=isobarMat,
featureCombinationFP=featureCombinationFP,
percentNorm=percentNorm,
topKCombinations=listFeatureChoices[interestingCombos, ])
save(summary, file = "cvfNB_summary.rda")
return(summary)
}
# Function ends
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.