## --------------------------------------------------------------------------
##
## This file is part of the EBBC software package.
##
## Version 1.0 - January 2020
##
##
## The EBBC package is free software; you can use it, redistribute it,
## and/or modify it under the terms of the GNU General Public License
## version 3 as published by the Free Software Foundation. The full text
## of the license can be found in the file LICENSE.txt at the top level of
## the package distribution.
##
## Authors:
## Michele Castelluzzo (1), Alessio Perinelli (1),
## Michela A. Denti (2) and Leonardo Ricci (1,3)
## (1) Department of Physics, University of Trento, 38123 Trento, Italy
## (2) Department of Cellular, Computational and Integrative Biology
## (CIBIO), University of Trento, 38123 Trento, Italy
## (3) CIMeC, Center for Mind/Brain Sciences, University of Trento,
## 38068 Rovereto, Italy
##
## michele.castelluzzo@unitn.it
## alessio.perinelli@unitn.it
## michela.denti@unitn.it
## leonardo.ricci@unitn.it
## https://github.com/LeonardoRicci/
## https://nse.physics.unitn.it/
##
##
## If you use the EBBC package for your analyses, please cite:
##
## L. Ricci, V. Del Vescovo, C. Cantaloni, M. Grasso, M. Barbareschi and
## M. A. Denti, Statistical analysis of a Bayesian classifier based on the
## expression of miRNAs, BMC Bioinformatics 16:287 (2015).
## DOI: 10.1186/s12859-015-0715-9
##
##
## --------------------------------------------------------------------------
#' Analysis of features and training of classifiers.
#'
#' This function carries out different tasks depending on the input parameters:
#' --> Analysis mode: analyzes the properties of each feature (possibly subtracting a normalizer) in terms of Target/Versus separation, normality, etc. A matrix of correlation coefficients between each pair of features is also assessed.
#' --> Training mode: trains a Bayesian classifier by assessing the corresponding optimal threshold values and the related uncertainties.
#'
#' In order to select between Analysis and Training mode, the input parameters "inputFeaturesList" and "coeffList" have to comply with the following requirements.
#' --> Analysis mode: "coeffList" has to be empty (i.e. omitted in the function call arguments). "inputFeaturesList" can either be empty (i.e. omitted in the function call arguments) or of length 1: in the latter case, the single entry of "inputFeaturesList" is assumed to be the normalizer.
#' --> Training mode: "inputFeaturesList" and "coeffList" have to be non-empty and of the same size.
#'
#' @param inputDataset Dataset (data frame) to be used for the analysis/training. The data frame must comply with the output format of the ebbc preprocessing functions (ebbc_preprocess and ebbc_removeOutliers), thus containing the columns 'Subject', 'Feature', 'Mean', 'StdDev', 'SampleSize', 'Class'. Any other column is ignored, and any missing column forbids execution. Please note that in this case the 'Class' column is mandatory.
#' @param inputTargetList List of classes to use as target for the classification. The chosen target must correspond to at least one of the classes present in the 'Class' column of the inputDataset.
#' @param inputVersusList List of classes to use as versus for the classification. If the argument is left empty, all classes present in the 'Class' column of the inputDataset, minus the Target classes, are used as Versus.
#' @param inputFeaturesList List of features to be used by the classifier ('Training mode'). The chosen features must be present in the 'Feature' column of the inputDataset. In 'Analysis mode', this argument has to be omitted (if no normalizer has to be used) or has to contain a single entry (corresponding to the feature to be used as normalizer).
#' @param coeffList List of coefficients for the classifier. In 'Training mode', the number of coefficients must be the same as the number of used features and listed in the same order. In 'Analysis mode', this argument has to be omitted.
#' @param outputFileBasename Name of the output file where the training results ('Training mode') or the analysis results ('Analysis mode') are to be stored. If not assigned, a filename is automatically generated. File names of other files created by the function are generated by appending suitable labels to the provided "outputFileBasename".
#' @param sep Field separator character for the output files; the default is tabulation.
#' @param plotFormat String specifying the format of generated graphic files (plots): can either be "pdf" (default) or "png".
#'
#' Beware! Cross-correlation coefficients, as well as Shapiro-Wilk tests for normality, require at least three data samples. In case of less than three samples, those tests are skipped and "NA" (not available) is reported in the corresponding output.
#'
#' @return In 'Analysis mode', a data framce containing the columns 'Feature', 'Classification', 'NumberOfSubjects', 'Mean', 'StdDev', 'NormalityTest', 't-test'. In 'Training mode', a data frame containing the columns 'Threshold', 'DeltaThreshold', 'DPrime', 'Pc', 'ChiUp', 'DChiUp', 'ChiDown', 'DChiDown'.
#' @export
ebbc_bayes <- function(inputDataset, inputTargetList, inputVersusList=character(), inputFeaturesList=character(), coeffList=double(), outputFileBasename="", sep='\t', plotFormat="pdf") {
cat("\nLOG: ebbc_bayes() called.\n")
## Input validation and pre-processing
if (!(("Subject" %in% colnames(inputDataset)) & ("Feature" %in% colnames(inputDataset)) & ("Mean" %in% colnames(inputDataset)) & ("StdDev" %in% colnames(inputDataset)) & ("SampleSize" %in% colnames(inputDataset) & ("Class" %in% colnames(inputDataset))))) {
cat("ERROR: unsuitable dataset format. Dataset must contain columns 'Subject', 'Feature', 'Mean', 'StdDev', 'SampleSize', 'Class'.\n")
return(NULL)
}
if (length(inputDataset[1,]) > 6) {
cat("WARNING: more than 6 dataset columns. Columns other than 'Subject', 'Feature', 'Mean', 'StdDev', 'SampleSize', 'Class' will be ignored.\n")
}
availableClasses <- levels(inputDataset$Class)
# Select subjects based on target list
listOfTargets <- unique(inputTargetList)
if (length(listOfTargets) != length(inputTargetList)){
cat("WARNING: target list presents some duplicates which will be ignored.\n")
}
inputTargetList <- unique(inputTargetList)
listOfTargets <- intersect(listOfTargets, availableClasses)
if (length(listOfTargets) != length(inputTargetList)){
cat("WARNING: some of the target classes are not present in the dataset and will be ignored.\n")
}
# Select subjects based on versus list
if (length(inputVersusList) == 0) {
inputVersusList <- setdiff(availableClasses, listOfTargets)
inputVersusList <- unique(inputVersusList)
}
listOfVersus <- unique(inputVersusList)
if (length(listOfVersus) != length(inputVersusList)){
cat("WARNING: versus list presents some duplicates which will be ignored.\n")
}
inputVersusList <- unique(inputVersusList)
listOfVersus <- intersect(listOfVersus, availableClasses)
if (length(listOfVersus) != length(inputVersusList)){
cat("WARNING: some of the versus classes are not present in the dataset and will be ignored.\n")
}
# Check target and versus list are non-empty and not intersecting
if (length(listOfTargets)==0 | length(listOfVersus)==0) {
cat("ERROR: unsuitable function arguments. The requested target and/or versus classes are empty, or not present in the dataset.\n")
return(NULL)
} else if (length(intersect(listOfTargets, listOfVersus))) {
cat("ERROR: conflicting function arguments; target set and versus set have non-empty intersection.\n")
return(NULL)
}
targetVersusFrame <- inputDataset[inputDataset$Class %in% c(listOfTargets,listOfVersus), ]
subjectsTargetVersus <- unique(targetVersusFrame$Subject)
# Check for duplicates features in input list
availableFeatures <- unique(targetVersusFrame$Feature)
listOfFeature <- unique(inputFeaturesList)
if (length(listOfFeature) != length(inputFeaturesList)){
cat("WARNING: features list presents some duplicates which will be ignored.\n")
}
inputFeaturesList <- unique(inputFeaturesList)
# Check for features from the list which are not present in the dataset
listOfFeature <- intersect(listOfFeature, availableFeatures)
if (length(listOfFeature) != length(inputFeaturesList)){
cat("ERROR: some entries of the features list are not present in the dataset.\n")
return(NULL)
}
# Correct for empty separator (default behavior of write.table is sep=" ")
if (sep == "")
sep <- " "
## Select function behavior
if ((length(inputFeaturesList) == 1) && (length(coeffList) == 0)) {
normalizer <- inputFeaturesList
normalizerFlag <- 1
classifierFlag <- 0
cat("LOG:\tebbc_bayes() is running in Analysis mode (with normalizer).\n")
} else if ((length(inputFeaturesList) == 0) && (length(coeffList) == 0)) {
normalizerFlag <- 0
classifierFlag <- 0
cat("LOG:\tebbc_bayes() is running in Analysis mode (without normalizer).\n")
} else if (length(inputFeaturesList) == length(coeffList)) {
normalizerFlag <- 0
classifierFlag <- 1
cat("LOG:\tebbc_bayes() is running in Training mode.\n")
} else {
cat("ERROR: unsuitable input parameters. The list of features and the list of coefficients do not match any valid pattern.\n\tType help(ebbc_bayes) for help.\n")
return(NULL)
}
## Actual computation starts here (loop over features; if classifierFlag==1, stops after first iteration)
for (currentFeature in availableFeatures) {
if ((normalizerFlag == 1) && (classifierFlag == 0)) {
if (currentFeature == normalizer)
next
listOfFeature <- c(currentFeature, normalizer)
listOfFeature <- unique(listOfFeature)
coeffList <- c(1, -1)
} else if ((normalizerFlag == 0) && (classifierFlag == 0)) {
listOfFeature <- currentFeature
coeffList <- 1
} else if ((normalizerFlag == 0) && (classifierFlag == 1)) {
listOfFeature <- inputFeaturesList
}
## Pre-Processing: Filter subjects by needed features
if (exists("subjectsToRemove"))
rm(subjectsToRemove)
for (subject in subjectsTargetVersus) {
subjectFrame <- targetVersusFrame[targetVersusFrame$Subject == subject,]
availableSubjectFeatures <- unique(subjectFrame$Feature)
if (length(listOfFeature) != length(intersect(availableSubjectFeatures, listOfFeature))) {
if (!exists("subjectsToRemove")) {
subjectsToRemove <- subject
} else {
subjectsToRemove <- rbind(subjectsToRemove, subject)
}
}
}
if (exists("subjectsToRemove")) {
subjectsToRemove <- unique(subjectsToRemove)
compliantSubjects <- setdiff(subjectsTargetVersus, subjectsToRemove)
} else {
compliantSubjects <- subjectsTargetVersus
}
if (length(compliantSubjects) == 0) {
cat("WARNING: no available subjects for features:\t", listOfFeature, "\n")
next
}
availableDataset <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects, ]
availableDataset <- availableDataset[availableDataset$Feature %in% listOfFeature, ]
if (length(availableDataset[availableDataset$Class %in% listOfTargets, 1]) == 0 | length(availableDataset[availableDataset$Class %in% listOfVersus, 1]) == 0) {
if (classifierFlag) {
cat("ERROR: No available target and/or versus subjects for this classifier.\n")
return(NULL)
} else {
cat("WARNING: No available target and/or versus subjects for feature '", currentFeature, "', continuing to next feature.\n")
next
}
}
## Pre-Processing: prepare dataframe in suitable format
dataFrameTemp <- availableDataset[availableDataset$Class %in% listOfTargets, ]
if (exists("tempFrame"))
rm(tempFrame)
for (feature in listOfFeature) {
columnSubjectMean <- dataFrameTemp[dataFrameTemp$Feature == feature,]
columnSubjectMean <- subset(columnSubjectMean, select=c("Subject", "Mean"))
if (!exists("tempFrame")) {
tempFrame <- columnSubjectMean
columnSubjectClass <- dataFrameTemp[dataFrameTemp$Feature == feature,]
columnSubjectClass <- subset(columnSubjectClass, select=c("Subject", "Class"))
} else {
tempFrame <- merge(tempFrame, columnSubjectMean, by = "Subject")
}
}
tempFrame <- merge(tempFrame, columnSubjectClass, by = "Subject")
completeDataFrame <- cbind(tempFrame, Classification=rep("target", length(tempFrame[,1])))
dataFrameTemp <- availableDataset[availableDataset$Class %in% listOfVersus, ]
if (exists("tempFrame"))
rm(tempFrame)
for (feature in listOfFeature) {
columnSubjectMean <- dataFrameTemp[dataFrameTemp$Feature == feature,]
columnSubjectMean <- subset(columnSubjectMean, select=c("Subject", "Mean"))
if (!exists("tempFrame")) {
tempFrame <- columnSubjectMean
columnSubjectClass <- dataFrameTemp[dataFrameTemp$Feature == feature,]
columnSubjectClass <- subset(columnSubjectClass, select=c("Subject", "Class"))
} else
tempFrame <- merge(tempFrame, columnSubjectMean, by = "Subject")
}
tempFrame <- merge(tempFrame, columnSubjectClass, by = "Subject")
completeDataFrame <- rbind(completeDataFrame, cbind(tempFrame, Classification=rep("versus", length(tempFrame[,1]))))
names(completeDataFrame) <- c("Subject", listOfFeature, "Class", "Classification")
## Processing: 1. Compute Score, i.e. linear combination of features
dataFrameTemp <- completeDataFrame[2:(1+length(listOfFeature))]
for (feature in listOfFeature)
dataFrameTemp[,feature] <- dataFrameTemp[,feature] * as.numeric(coeffList[which(listOfFeature == feature)])
completeDataFrame <- cbind(completeDataFrame, Score=rowSums(dataFrameTemp))
completeDataFrame <- data.frame(completeDataFrame[with(completeDataFrame, order(Score)),])
## Processing: 2. Means, StdDevs, NormalityTest
# Target
nT <- length(completeDataFrame$Score[completeDataFrame$Classification == "target"])
xT <- mean(completeDataFrame$Score[completeDataFrame$Classification == "target"])
sT <- stats::sd(completeDataFrame$Score[completeDataFrame$Classification == "target"])
dxT <- sT/sqrt(nT)
dsT <- sT/sqrt(2*(nT-1))
if (!classifierFlag) {
if (length(completeDataFrame$Score[completeDataFrame$Classification == "target"]) > 2) {
res <- stats::shapiro.test(completeDataFrame$Score[completeDataFrame$Classification == "target"])
shapiroWilkTarget <- (res[[2]])
} else {
shapiroWilkTarget <- NA
cat("WARNING: Feature '", currentFeature, "' has less than 3 Subjects in the target set. Cannot carry out Shapiro-Wilk test.\n")
}
}
# Versus
nV <- length(completeDataFrame$Score[completeDataFrame$Classification == "versus"])
xV <- mean(completeDataFrame$Score[completeDataFrame$Classification == "versus"])
sV <- stats::sd(completeDataFrame$Score[completeDataFrame$Classification == "versus"])
dxV <- sV/sqrt(nV)
dsV <- sV/sqrt(2*(nV-1))
if (!classifierFlag) {
if (length(completeDataFrame$Score[completeDataFrame$Classification == "versus"]) > 2) {
res <- stats::shapiro.test(completeDataFrame$Score[completeDataFrame$Classification == "versus"])
shapiroWilkVersus <- (res[[2]])
} else {
shapiroWilkVersus <- NA
cat("WARNING: Feature '", currentFeature, "' has less than 3 Subjects in the versus set. Cannot carry out Shapiro-Wilk test.\n")
}
}
# Target and Versus
nTV <- length(completeDataFrame$Score)
xTV <- mean(completeDataFrame$Score)
sTV <- stats::sd(completeDataFrame$Score)
dxTV <- sTV/sqrt(nTV)
dsTV <- sTV/sqrt(2*(nTV-1))
if (!classifierFlag) {
if (length(completeDataFrame$Score) > 2) {
res <- stats::shapiro.test(completeDataFrame$Score)
shapiroWilk <- (res[[2]])
} else {
shapiroWilkVersus <- NA
cat("WARNING: Feature '", currentFeature, "' has less than 3 Subjects in the dataset. Cannot carry out Shapiro-Wilk test.\n")
}
}
localPrecision <- 1
local_muAndS_precision <- localPrecision
## Processing: 3. Student's t Test between target and versus Scores
tTestRes <- stats::t.test(completeDataFrame$Score[completeDataFrame$Classification == "target"], completeDataFrame$Score[completeDataFrame$Classification == "versus"], var.equal = FALSE)
## Processing: 4. d'
d <- xT - xV
dd <- sqrt((sT*sT + sV*sV)/2.)
dee <- d/dd
dDee <- sqrt(((dxT/dd)**2) + ((dxV/dd)**2) + ((dee/dd/dd/2)**2)*(sT*sT*dsT*dsT + sV*sV*dsV*dsV))
## Processing: 5. In the case of Classification, compute thresholds
if (classifierFlag) {
## Auxiliary function: computes chi, accuracy and the respective errors
compute_chi_dChi <- function(xT, xV, sT, sV, dxT, dxV, dsT, dsV, ratio) {
options(warn=-1)
epsilon <- 1.e-6
chiPc <- compute_chi_dChi_aux(xT, xV, sT, sV, ratio)
dchiPc_dxt <- (compute_chi_dChi_aux(xT + epsilon/2., xV, sT, sV, ratio) - compute_chi_dChi_aux(xT - epsilon/2., xV, sT, sV, ratio))/epsilon
dchiPc_dxv <- (compute_chi_dChi_aux(xT, xV + epsilon/2., sT, sV, ratio) - compute_chi_dChi_aux(xT, xV - epsilon/2., sT, sV, ratio))/epsilon
dchiPc_dst <- (compute_chi_dChi_aux(xT, xV, sT + epsilon/2., sV, ratio) - compute_chi_dChi_aux(xT, xV, sT - epsilon/2., sV, ratio))/epsilon
dchiPc_dsv <- (compute_chi_dChi_aux(xT, xV, sT, sV + epsilon/2., ratio) - compute_chi_dChi_aux(xT, xV, sT, sV - epsilon/2., ratio))/epsilon
dchiPc <- dchiPc_dxt*dchiPc_dxt*dxT*dxT + dchiPc_dxv*dchiPc_dxv*dxV*dxV + dchiPc_dst*dchiPc_dst*dsT*dsT + dchiPc_dsv*dchiPc_dsv*dsV*dsV
dchiPc <- sqrt(dchiPc)
options(warn=0)
return(c(chiPc, dchiPc))
}
## Auxiliary function: computes treshold value chi and accuracy (called by compute_chi_dChi)
compute_chi_dChi_aux <- function(xT, xV, sT, sV, ratio) {
alpha <- 1/sT/sT - 1/sV/sV
beta <- xT/sT/sT - xV/sV/sV
gamma <- xT*xT/sT/sT - xV*xV/sV/sV - 2*log(sV/sT) + 2*log(ratio)
chi <- as.numeric(polyroot(c(gamma, -2*beta, alpha)))
pc <- (ratio*stats::pnorm((xT-chi)/sT) + stats::pnorm((chi-xV)/sV))/(ratio + 1)
return(c(chi,pc))
}
# pv/pt = 1
chi_dChi <- compute_chi_dChi(xT, xV, sT, sV, dxT, dxV, dsT, dsV, 1)
chi <- chi_dChi[1:2]
pc <- chi_dChi[3:4]
dchi <- chi_dChi[5:6]
dpc <- chi_dChi[7:8]
dpc <- dpc[which.min(((chi-(xT + xV)/2)**2))]
pc <- pc[which.min(((chi-(xT + xV)/2)**2))]
dchi <- dchi[which.min(((chi-(xT + xV)/2)**2))]
chi <- chi[which.min(((chi-(xT + xV)/2)**2))]
# pv/pt = likelyhoodRatio: 90% Target threshold
chi_dChi <- compute_chi_dChi(xT, xV, sT, sV, dxT, dxV, dsT, dsV, 9.)
chiUp <- chi_dChi[1:2]
dchiUp <- chi_dChi[5:6]
dchiUp <- dchiUp[which.min(((chi-(xT + xV)/2)**2))]
chiUp <- chiUp[which.min(((chi-(xT + xV)/2)**2))]
# pv/pt = likelyhoodRatio: 90% Versus threshold
chi_dChi <- compute_chi_dChi(xT, xV, sT, sV, dxT, dxV, dsT, dsV, 1./9.)
chiDown <- chi_dChi[1:2]
dchiDown <- chi_dChi[5:6]
dchiDown <- dchiDown[which.min(((chi-(xT + xV)/2)**2))]
chiDown <- chiDown[which.min(((chi-(xT + xV)/2)**2))]
# ROC and corresponding area under curve
pred <- completeDataFrame$Score
obs <- completeDataFrame$Classification
pROC_data <- suppressMessages(pROC::roc(obs, pred))
ciAuc <- suppressMessages(pROC::ci.auc(pROC_data))
outputThresholdFrame <- data.frame(chi, dchi, dee, pc, chiUp, dchiUp, chiDown, dchiDown, ciAuc[2], ciAuc[1], ciAuc[3])
names(outputThresholdFrame) <- c("Threshold", "DeltaThreshold", "DPrime", "Pc", "ChiUp", "DChiUp", "ChiDown", "DChiDown", "AUC", "AUCDown", "AUCUp")
}
## Processing: 5b. In the case of feature analysis (not Classification), prepare output
completeDataFrame <- subset(completeDataFrame, select = c("Subject", "Classification", "Score", listOfFeature))
if (!classifierFlag) {
if (normalizerFlag) {
rowTarget <- c(paste("Δ", currentFeature, sep=""), "target", nT, round(xT,local_muAndS_precision), round(sT,local_muAndS_precision), round(shapiroWilkTarget,2), signif(tTestRes[[3]],2))
rowVersus <- c(paste("Δ", currentFeature, sep=""), "versus", nV, round(xV,local_muAndS_precision), round(sV,local_muAndS_precision), round(shapiroWilkVersus,2), signif(tTestRes[[3]],2))
} else {
rowTarget <- c(currentFeature, "target", nT, round(xT,local_muAndS_precision), round(sT,local_muAndS_precision), round(shapiroWilkTarget,2), signif(tTestRes[[3]],2))
rowVersus <- c(currentFeature, "versus", nV, round(xV,local_muAndS_precision), round(sV,local_muAndS_precision), round(shapiroWilkVersus,2), signif(tTestRes[[3]],2))
}
if (!exists("outputStatisticsFrame")) {
outputStatisticsFrame <- rowTarget
outputStatisticsFrame <- rbind(outputStatisticsFrame, rowVersus)
} else {
outputStatisticsFrame <- rbind(outputStatisticsFrame, rowTarget)
outputStatisticsFrame <- rbind(outputStatisticsFrame, rowVersus)
}
}
## Output: filename building
if (exists("classifierLabel")) {
rm(classifierLabel)
}
for (i in 1:length(coeffList)) {
if(!exists("classifierLabel"))
classifierLabel <- paste(coeffList[i], listOfFeature[i], sep="_")
else
classifierLabel <- paste(classifierLabel, coeffList[i], listOfFeature[i], sep="_")
}
targetLabel <- paste(listOfTargets, sep="", collapse="_")
versusLabel <- paste(listOfVersus, sep="", collapse="_")
targetVSVersusLabel <- paste(targetLabel, "vs", versusLabel, sep="_", collapse="")
## Output: write to file (classifier case)
if (classifierFlag) {
if (outputFileBasename == "") {
thresholdFileName <- paste(targetVSVersusLabel, "u", classifierLabel, sep="_")
thresholdFileName <- paste(thresholdFileName, "_thresholds", ".txt", sep="")
scoresFileName <- paste(thresholdFileName, "_scores", ".dat", sep="")
} else {
thresholdFileName <- paste(outputFileBasename, ".txt", sep="")
scoresFileName <- paste(outputFileBasename, ".dat", sep="")
}
if (file.exists(thresholdFileName) & file.access(thresholdFileName, mode=2)) {
cat("ERROR:\tcannot write ", thresholdFileName, ". Check write permission.\n")
return(NULL)
}
if (file.exists(scoresFileName) & file.access(scoresFileName, mode=2)) {
cat("ERROR:\tcannot write ", scoresFileName, ". Check write permission.\n")
return(NULL)
}
cat("#classifier: ", file=thresholdFileName)
for (i in 1:length(coeffList)) {
cat(paste(coeffList[i], "*", listOfFeature[i], sep=""), " ", file=thresholdFileName, append = TRUE)
}
cat("\n", file=thresholdFileName, append = TRUE)
options(warn=-1)
utils::write.table(format(outputThresholdFrame, drop0trailing=FALSE), file=thresholdFileName, append=TRUE, sep=sep, row.names=FALSE, col.names=TRUE, quote=FALSE)
utils::write.table(format(subset(completeDataFrame, select=c("Subject", "Classification", "Score")), drop0trailing=FALSE), file=scoresFileName, append=TRUE, sep=sep, row.names=FALSE, col.names=TRUE, quote=FALSE)
options(warn=0)
cat("LOG:\tThresholds data frame written to ", thresholdFileName, " successfully.\n", sep="")
cat("LOG:\tScores data frame written to ", scoresFileName, " successfully.\n", sep="")
}
## Output: plot (all cases)
if (outputFileBasename == "") {
plotFileName <- paste(targetVSVersusLabel, "u", classifierLabel, sep="_")
} else {
if (classifierFlag) {
plotFileName <- outputFileBasename
} else {
plotFileName <- paste(outputFileBasename, currentFeature, sep="_")
}
}
if (classifierFlag) {
histPlot <- ebbc_plotHistograms(completeDataFrame, outputThresholdFrame, plotFileName, plotFormat=plotFormat)
cat("LOG:\tHistogram plot saved to ", plotFileName, "_histogram.", plotFormat, " successfully.\n", sep="")
thresholdPlot <- ebbc_plotThresholds(completeDataFrame, outputThresholdFrame, plotFileName, plotFormat=plotFormat)
cat("LOG:\tScore classification plot saved to ", plotFileName, "_score.", plotFormat, " successfully.\n", sep="")
rocPlot <- ebbc_plotROC(completeDataFrame, plotFileName, plotFormat=plotFormat)
cat("LOG:\tROC plot saved to ", plotFileName, "_ROC.", plotFormat, " successfully.\n", sep="")
} else {
histPlot <- ebbc_plotHistograms(completeDataFrame, outputFileLabel=plotFileName, plotFormat=plotFormat)
cat("LOG:\tHistogram plot saved to ", plotFileName, "_histogram.", plotFormat, " successfully.\n", sep="")
}
if (classifierFlag)
break
}
if ((normalizerFlag == 1) && (classifierFlag == 0)) {
normalizer <- inputFeaturesList
listOfFeature <- setdiff(availableFeatures, normalizer)
normalizerFlag <- 1
} else if ((normalizerFlag == 0) && (classifierFlag == 0)) {
listOfFeature <- unique(inputDataset$Feature)
} else if ((normalizerFlag == 0) && (classifierFlag == 1)) {
listOfFeature <- inputFeaturesList
}
## Output: write to file (analysis case)
if (!classifierFlag) {
outputStatisticsFrame <- data.frame(outputStatisticsFrame, row.names=NULL)
names(outputStatisticsFrame) <- c("Feature", "Classification", "NumberOfSubjects", "Mean", "StdDev", "NormalityTest", "t-test")
if (outputFileBasename == "") {
if (normalizerFlag) {
statisticsFileName <- paste(targetVSVersusLabel, "norm", normalizer, "statistics", sep="_")
} else {
statisticsFileName <- paste(targetVSVersusLabel, "statistics", sep="_")
}
statisticsFileName <- paste(statisticsFileName, ".txt", sep="")
} else {
statisticsFileName <- paste(outputFileBasename, ".txt", sep="")
}
if (file.exists(statisticsFileName) & file.access(statisticsFileName, mode=2)) {
cat("ERROR: cannot write ", statisticsFileName, ". Check write permission.\n")
return(NULL)
}
options(warn=-1)
utils::write.table(format(outputStatisticsFrame, drop0trailing=FALSE), file=statisticsFileName, sep=sep, row.names=FALSE, col.names=TRUE, quote=FALSE)
options(warn=0)
## Processing: Correlation coefficients
targetLabel <- paste(listOfTargets, sep="", collapse="_")
versusLabel <- paste(listOfVersus, sep="", collapse="_")
targetVSVersusLabel <- paste(targetLabel, "vs", versusLabel, sep="_", collapse="")
localPrecision <- 2
if (normalizerFlag) {
matrixLabels <- paste("\u0394", listOfFeature, sep='')
} else {
matrixLabels <- listOfFeature
}
cat("\n", append=TRUE, file=statisticsFileName)
for (classificationLabel in c("target", "versus")) {
cat("###############################\n\n", append=TRUE, file=statisticsFileName)
if (classificationLabel=="target")
cat("## Evaluation on subjects whose classification belongs to the Target set\n", append=TRUE, file=statisticsFileName)
else
cat("## Evaluation on subjects whose classification belongs to the Versus set\n", append=TRUE, file=statisticsFileName)
cat("# Correlation coefficients:\n", append=TRUE, file=statisticsFileName)
cat("\tr", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
cat("\n", append=TRUE, file=statisticsFileName)
for (a in listOfFeature) {
cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, ]
if (classificationLabel == "target") {
subjectsA <- subjectsA[subjectsA$Class %in% listOfTargets, "Subject"]
} else if (classificationLabel == "versus") {
subjectsA <- subjectsA[subjectsA$Class %in% listOfVersus, "Subject"]
}
for (b in listOfFeature) {
if (which(listOfFeature == a) < which(listOfFeature == b)) {
subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
if (normalizerFlag) {
subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
compliantSubjects <- intersect(subjectsA, subjectsB)
compliantSubjects <- intersect(compliantSubjects, subjectsN)
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
} else {
compliantSubjects <- intersect(subjectsA, subjectsB)
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
}
if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
cat(sep=sep, "", round(stats::cor(valuesA, valuesB), localPrecision), append=TRUE, file=statisticsFileName)
} else {
cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
cat("WARNING: Correlation between '", a, "', '", b, "' cannot be computed for the ", classificationLabel, " set: less than 3 values.\n", sep="")
}
} else {
cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
}
}
cat("\n", append=TRUE, file=statisticsFileName)
}
cat("\n", append=TRUE, file=statisticsFileName)
cat(sep="", "# Correlation p-values \"", classificationLabel, "\":\n", append=TRUE, file=statisticsFileName)
cat("\tp", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
cat("\n", append=TRUE, file=statisticsFileName)
for (a in listOfFeature) {
cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, ]
if (classificationLabel == "target") {
subjectsA <- subjectsA[subjectsA$Class %in% listOfTargets, "Subject"]
} else if (classificationLabel == "versus") {
subjectsA <- subjectsA[subjectsA$Class %in% listOfVersus, "Subject"]
}
for (b in listOfFeature) {
if (which(listOfFeature == a) < which(listOfFeature == b)) {
subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
if (normalizerFlag) {
subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
compliantSubjects <- intersect(subjectsA, subjectsB)
compliantSubjects <- intersect(compliantSubjects, subjectsN)
if (length(compliantSubjects) == 0) next
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
} else {
compliantSubjects <- intersect(subjectsA, subjectsB)
if (length(compliantSubjects) == 0) next
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
}
if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
ctst <- stats::cor.test(valuesA, valuesB)
cat(sep=sep, "", signif(ctst[[3]], localPrecision), append=TRUE, file=statisticsFileName)
} else {
cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
cat("WARNING: Correlation (p-value) between '", a, "', '", b, "' cannot be computed for the ", classificationLabel, " set: less than 3 values.\n", sep="")
}
} else {
cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
}
}
cat("\n", append=TRUE, file=statisticsFileName)
}
cat("\n", append=TRUE, file=statisticsFileName)
}
cat("###############################\n\n", append=TRUE, file=statisticsFileName)
cat("## Evaluation on subjects whose classification belongs to the union of Target and Versus sets\n", append=TRUE, file=statisticsFileName)
cat("# Correlation coefficients:\n", append=TRUE, file=statisticsFileName)
cat("\tr", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
cat("\n", append=TRUE, file=statisticsFileName)
for (a in listOfFeature) {
cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, "Subject"]
for (b in listOfFeature) {
if (which(listOfFeature == a) < which(listOfFeature == b)) {
subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
if (normalizerFlag) {
subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
compliantSubjects <- intersect(subjectsA, subjectsB)
compliantSubjects <- intersect(compliantSubjects, subjectsN)
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
} else {
compliantSubjects <- intersect(subjectsA, subjectsB)
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
}
if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
cat(sep=sep, "", round(stats::cor(valuesA, valuesB), localPrecision), append=TRUE, file=statisticsFileName)
} else {
cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
cat("WARNING: Correlation between '", a, "', '", b, "' cannot be computed: less than 3 values.\n", sep="")
}
} else {
cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
}
}
cat("\n", append=TRUE, file=statisticsFileName)
}
cat("\n", append=TRUE, file=statisticsFileName)
cat("# Correlation p-values:\n", append=TRUE, file=statisticsFileName)
cat("\tp", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
cat("\n", append=TRUE, file=statisticsFileName)
for (a in listOfFeature) {
cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, "Subject"]
for (b in listOfFeature) {
if (which(listOfFeature == a) < which(listOfFeature == b)) {
subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
if (normalizerFlag) {
subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
compliantSubjects <- intersect(subjectsA, subjectsB)
compliantSubjects <- intersect(compliantSubjects, subjectsN)
if (length(compliantSubjects) == 0) next
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
} else {
compliantSubjects <- intersect(subjectsA, subjectsB)
if (length(compliantSubjects) == 0) next
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
}
if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
ctst <- stats::cor.test(valuesA, valuesB)
cat(sep=sep, "", signif(ctst[[3]], localPrecision), append=TRUE, file=statisticsFileName)
} else {
cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
cat("WARNING: Correlation (p-value) between '", a, "', '", b, "' cannot be computed: less than 3 values.\n", sep="")
}
} else
cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
}
cat("\n", append=TRUE, file=statisticsFileName)
}
cat("\n", append=TRUE, file=statisticsFileName)
cat("# Epsilon:\n", append=TRUE, file=statisticsFileName)
cat("\teps", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
cat("\n", append=TRUE, file=statisticsFileName)
for (a in listOfFeature) {
cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, "Subject"]
for (b in listOfFeature) {
if (a != b) {
subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
if (normalizerFlag) {
subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
compliantSubjects <- intersect(subjectsA, subjectsB)
compliantSubjects <- intersect(compliantSubjects, subjectsN)
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
} else {
compliantSubjects <- intersect(subjectsA, subjectsB)
filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
}
if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
cat(sep=sep, "", round(-stats::cor(valuesA, valuesB)*stats::sd(valuesA)/stats::sd(valuesB), localPrecision), append=TRUE, file=statisticsFileName)
} else {
cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
cat("WARNING: Optimized coefficient between '", a, "', '", b, "' cannot be computed: less than 3 values.\n", sep="")
}
} else
cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
}
cat("\n", append=TRUE, file=statisticsFileName)
}
cat("\n", append=TRUE, file=statisticsFileName)
cat("LOG:\tFeature statistics and correlation matrices written to ", statisticsFileName, " successfully.\n", sep="")
}
if (classifierFlag) {
return(outputThresholdFrame)
} else {
return(outputStatisticsFrame)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.