R/ebbc_bayes.R

Defines functions ebbc_bayes

## --------------------------------------------------------------------------
##
## This file is part of the EBBC software package.
##
## Version 1.0 - January 2020
##
##
## The EBBC package is free software; you can use it, redistribute it,
## and/or modify it under the terms of the GNU General Public License
## version 3 as published by the Free Software Foundation. The full text
## of the license can be found in the file LICENSE.txt at the top level of
## the package distribution.
##
## Authors:
##	Michele Castelluzzo (1), Alessio Perinelli (1),
##		Michela A. Denti (2) and Leonardo Ricci (1,3)
##	(1) Department of Physics, University of Trento, 38123 Trento, Italy
##	(2) Department of Cellular, Computational and Integrative Biology
##		(CIBIO), University of Trento, 38123 Trento, Italy
##	(3) CIMeC, Center for Mind/Brain Sciences, University of Trento,
##		38068 Rovereto, Italy
##
##	michele.castelluzzo@unitn.it
##	alessio.perinelli@unitn.it
##	michela.denti@unitn.it
##	leonardo.ricci@unitn.it
##	https://github.com/LeonardoRicci/
##	https://nse.physics.unitn.it/
##
##
## If you use the EBBC package for your analyses, please cite:
##
##	L. Ricci, V. Del Vescovo, C. Cantaloni, M. Grasso, M. Barbareschi and
##	M. A. Denti, Statistical analysis of a Bayesian classifier based on the
##	expression of miRNAs, BMC Bioinformatics 16:287 (2015).
##	DOI: 10.1186/s12859-015-0715-9
##
##
## --------------------------------------------------------------------------

#' Analysis of features and training of classifiers.
#'
#' This function carries out different tasks depending on the input parameters:
#' --> Analysis mode: analyzes the properties of each feature (possibly subtracting a normalizer) in terms of Target/Versus separation, normality, etc. A matrix of correlation coefficients between each pair of features is also assessed.
#' --> Training mode: trains a Bayesian classifier by assessing the corresponding optimal threshold values and the related uncertainties.
#'
#' In order to select between Analysis and Training mode, the input parameters "inputFeaturesList" and "coeffList" have to comply with the following requirements.
#' --> Analysis mode: "coeffList" has to be empty (i.e. omitted in the function call arguments). "inputFeaturesList" can either be empty (i.e. omitted in the function call arguments) or of length 1: in the latter case, the single entry of "inputFeaturesList" is assumed to be the normalizer.
#' --> Training mode: "inputFeaturesList" and "coeffList" have to be non-empty and of the same size.
#'
#' @param inputDataset Dataset (data frame) to be used for the analysis/training. The data frame must comply with the output format of the ebbc preprocessing functions (ebbc_preprocess and ebbc_removeOutliers), thus containing the columns 'Subject', 'Feature', 'Mean', 'StdDev', 'SampleSize', 'Class'. Any other column is ignored, and any missing column forbids execution. Please note that in this case the 'Class' column is mandatory.
#' @param inputTargetList List of classes to use as target for the classification. The chosen target must correspond to at least one of the classes present in the 'Class' column of the inputDataset.
#' @param inputVersusList List of classes to use as versus for the classification. If the argument is left empty, all classes present in the 'Class' column of the inputDataset, minus the Target classes, are used as Versus.
#' @param inputFeaturesList List of features to be used by the classifier ('Training mode'). The chosen features must be present in the 'Feature' column of the inputDataset. In 'Analysis mode', this argument has to be omitted (if no normalizer has to be used) or has to contain a single entry (corresponding to the feature to be used as normalizer).
#' @param coeffList List of coefficients for the classifier. In 'Training mode', the number of coefficients must be the same as the number of used features and listed in the same order. In 'Analysis mode', this argument has to be omitted.
#' @param outputFileBasename Name of the output file where the training results ('Training mode') or the analysis results ('Analysis mode') are to be stored. If not assigned, a filename is automatically generated. File names of other files created by the function are generated by appending suitable labels to the provided "outputFileBasename".
#' @param sep Field separator character for the output files; the default is tabulation.
#' @param plotFormat String specifying the format of generated graphic files (plots): can either be "pdf" (default) or "png".
#'
#' Beware! Cross-correlation coefficients, as well as Shapiro-Wilk tests for normality, require at least three data samples. In case of less than three samples, those tests are skipped and "NA" (not available) is reported in the corresponding output.
#'
#' @return In 'Analysis mode', a data framce containing the columns 'Feature', 'Classification', 'NumberOfSubjects', 'Mean', 'StdDev', 'NormalityTest', 't-test'. In 'Training mode', a data frame containing the columns 'Threshold', 'DeltaThreshold', 'DPrime', 'Pc', 'ChiUp', 'DChiUp', 'ChiDown', 'DChiDown'.

#' @export
ebbc_bayes <- function(inputDataset, inputTargetList, inputVersusList=character(), inputFeaturesList=character(), coeffList=double(), outputFileBasename="", sep='\t', plotFormat="pdf") {

	cat("\nLOG: ebbc_bayes() called.\n")

	## Input validation and pre-processing

	if (!(("Subject" %in% colnames(inputDataset)) & ("Feature" %in% colnames(inputDataset)) & ("Mean" %in% colnames(inputDataset)) & ("StdDev" %in% colnames(inputDataset)) & ("SampleSize" %in% colnames(inputDataset) & ("Class" %in% colnames(inputDataset)))))  {
		cat("ERROR: unsuitable dataset format. Dataset must contain columns 'Subject', 'Feature', 'Mean', 'StdDev', 'SampleSize', 'Class'.\n")
		return(NULL)
	}

	if (length(inputDataset[1,]) > 6) {
		cat("WARNING: more than 6 dataset columns. Columns other than 'Subject', 'Feature', 'Mean', 'StdDev', 'SampleSize', 'Class' will be ignored.\n")
	}

	availableClasses <- levels(inputDataset$Class)

	# Select subjects based on target list
	listOfTargets <- unique(inputTargetList)
	if (length(listOfTargets) != length(inputTargetList)){
		cat("WARNING: target list presents some duplicates which will be ignored.\n")
	}
	inputTargetList <- unique(inputTargetList)
	listOfTargets <- intersect(listOfTargets, availableClasses)
	if (length(listOfTargets) != length(inputTargetList)){
		cat("WARNING: some of the target classes are not present in the dataset and will be ignored.\n")
	}

	# Select subjects based on versus list
	if (length(inputVersusList) == 0) {
		inputVersusList <- setdiff(availableClasses, listOfTargets)
		inputVersusList <- unique(inputVersusList)
	}
	listOfVersus <- unique(inputVersusList)
	if (length(listOfVersus) != length(inputVersusList)){
		cat("WARNING: versus list presents some duplicates which will be ignored.\n")
	}
	inputVersusList <- unique(inputVersusList)
	listOfVersus <- intersect(listOfVersus, availableClasses)
	if (length(listOfVersus) != length(inputVersusList)){
		cat("WARNING: some of the versus classes are not present in the dataset and will be ignored.\n")
	}

	# Check target and versus list are non-empty and not intersecting
	if (length(listOfTargets)==0 | length(listOfVersus)==0) {
		cat("ERROR: unsuitable function arguments. The requested target and/or versus classes are empty, or not present in the dataset.\n")
		return(NULL)
	} else if (length(intersect(listOfTargets, listOfVersus))) {
		cat("ERROR: conflicting function arguments; target set and versus set have non-empty intersection.\n")
		return(NULL)
	}

	targetVersusFrame <- inputDataset[inputDataset$Class %in% c(listOfTargets,listOfVersus), ]
	subjectsTargetVersus <- unique(targetVersusFrame$Subject)

	# Check for duplicates features in input list
	availableFeatures <- unique(targetVersusFrame$Feature)
	listOfFeature <- unique(inputFeaturesList)
	if (length(listOfFeature) != length(inputFeaturesList)){
		cat("WARNING: features list presents some duplicates which will be ignored.\n")
	}
	inputFeaturesList <- unique(inputFeaturesList)

	# Check for features from the list which are not present in the dataset
	listOfFeature <- intersect(listOfFeature, availableFeatures)
	if (length(listOfFeature) != length(inputFeaturesList)){
		cat("ERROR: some entries of the features list are not present in the dataset.\n")
		return(NULL)
	}

	# Correct for empty separator (default behavior of write.table is sep=" ")
	if (sep == "")
		sep <- " "

	## Select function behavior

	if ((length(inputFeaturesList) == 1) && (length(coeffList) == 0)) {
		normalizer <- inputFeaturesList
		normalizerFlag <- 1
		classifierFlag <- 0
		cat("LOG:\tebbc_bayes() is running in Analysis mode (with normalizer).\n")
	} else if ((length(inputFeaturesList) == 0) && (length(coeffList) == 0)) {
		normalizerFlag <- 0
		classifierFlag <- 0
		cat("LOG:\tebbc_bayes() is running in Analysis mode (without normalizer).\n")
	} else if (length(inputFeaturesList) == length(coeffList)) {
		normalizerFlag <- 0
		classifierFlag <- 1
		cat("LOG:\tebbc_bayes() is running in Training mode.\n")
	} else {
		cat("ERROR: unsuitable input parameters. The list of features and the list of coefficients do not match any valid pattern.\n\tType help(ebbc_bayes) for help.\n")
		return(NULL)
	}

	## Actual computation starts here (loop over features; if classifierFlag==1, stops after first iteration)

	for (currentFeature in availableFeatures) {

		if ((normalizerFlag == 1) && (classifierFlag == 0)) {
			if (currentFeature == normalizer)
				next
			listOfFeature <- c(currentFeature, normalizer)
			listOfFeature <- unique(listOfFeature)
			coeffList <- c(1, -1)
		} else if ((normalizerFlag == 0) && (classifierFlag == 0)) {
			listOfFeature <- currentFeature
			coeffList <- 1
		} else if ((normalizerFlag == 0) && (classifierFlag == 1)) {
			listOfFeature <- inputFeaturesList
		}

		## Pre-Processing: Filter subjects by needed features

		if (exists("subjectsToRemove"))
			rm(subjectsToRemove)

		for (subject in subjectsTargetVersus) {
			subjectFrame <- targetVersusFrame[targetVersusFrame$Subject == subject,]
			availableSubjectFeatures <- unique(subjectFrame$Feature)
			if (length(listOfFeature) != length(intersect(availableSubjectFeatures, listOfFeature))) {
				if (!exists("subjectsToRemove")) {
					subjectsToRemove <- subject
				} else {
					subjectsToRemove <- rbind(subjectsToRemove, subject)
				}
			}
		}
		if (exists("subjectsToRemove")) {
			subjectsToRemove <- unique(subjectsToRemove)
			compliantSubjects <- setdiff(subjectsTargetVersus, subjectsToRemove)
		} else {
			compliantSubjects <- subjectsTargetVersus
		}
		if (length(compliantSubjects) == 0) {
			cat("WARNING: no available subjects for features:\t", listOfFeature, "\n")
			next
		}

		availableDataset <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects, ]
		availableDataset <- availableDataset[availableDataset$Feature %in% listOfFeature, ]

		if (length(availableDataset[availableDataset$Class %in% listOfTargets, 1]) == 0 | length(availableDataset[availableDataset$Class %in% listOfVersus, 1]) == 0) {
			if (classifierFlag) {
				cat("ERROR: No available target and/or versus subjects for this classifier.\n")
				return(NULL)
			} else {
				cat("WARNING: No available target and/or versus subjects for feature '", currentFeature, "', continuing to next feature.\n")
				next
			}
		}

		## Pre-Processing: prepare dataframe in suitable format

		dataFrameTemp <- availableDataset[availableDataset$Class %in% listOfTargets, ]
		if (exists("tempFrame"))
			rm(tempFrame)
		for (feature in listOfFeature) {
			columnSubjectMean <- dataFrameTemp[dataFrameTemp$Feature == feature,]
			columnSubjectMean <- subset(columnSubjectMean, select=c("Subject", "Mean"))

			if (!exists("tempFrame")) {
				tempFrame <- columnSubjectMean
				columnSubjectClass <- dataFrameTemp[dataFrameTemp$Feature == feature,]
				columnSubjectClass <- subset(columnSubjectClass, select=c("Subject", "Class"))
			} else {
				tempFrame <- merge(tempFrame, columnSubjectMean, by = "Subject")
			}

		}
		tempFrame <- merge(tempFrame, columnSubjectClass, by = "Subject")
		completeDataFrame <- cbind(tempFrame, Classification=rep("target", length(tempFrame[,1])))

		dataFrameTemp <- availableDataset[availableDataset$Class %in% listOfVersus, ]
		if (exists("tempFrame"))
			rm(tempFrame)
		for (feature in listOfFeature) {
			columnSubjectMean <- dataFrameTemp[dataFrameTemp$Feature == feature,]
			columnSubjectMean <- subset(columnSubjectMean, select=c("Subject", "Mean"))

			if (!exists("tempFrame")) {
				tempFrame <- columnSubjectMean
				columnSubjectClass <- dataFrameTemp[dataFrameTemp$Feature == feature,]
				columnSubjectClass <- subset(columnSubjectClass, select=c("Subject", "Class"))
			} else
				tempFrame <- merge(tempFrame, columnSubjectMean, by = "Subject")

		}
		tempFrame <- merge(tempFrame, columnSubjectClass, by = "Subject")

		completeDataFrame <- rbind(completeDataFrame, cbind(tempFrame, Classification=rep("versus", length(tempFrame[,1]))))

		names(completeDataFrame) <- c("Subject", listOfFeature, "Class", "Classification")

		## Processing: 1. Compute Score, i.e. linear combination of features

		dataFrameTemp <- completeDataFrame[2:(1+length(listOfFeature))]

		for (feature in listOfFeature)
			dataFrameTemp[,feature] <- dataFrameTemp[,feature] * as.numeric(coeffList[which(listOfFeature == feature)])

		completeDataFrame <- cbind(completeDataFrame, Score=rowSums(dataFrameTemp))
		completeDataFrame <- data.frame(completeDataFrame[with(completeDataFrame, order(Score)),])

		## Processing: 2. Means, StdDevs, NormalityTest

		# Target
		nT <- length(completeDataFrame$Score[completeDataFrame$Classification == "target"])
		xT <- mean(completeDataFrame$Score[completeDataFrame$Classification == "target"])
		sT <- stats::sd(completeDataFrame$Score[completeDataFrame$Classification == "target"])
		dxT <- sT/sqrt(nT)
		dsT <- sT/sqrt(2*(nT-1))
		if (!classifierFlag) {
			if (length(completeDataFrame$Score[completeDataFrame$Classification == "target"]) > 2) {
				res <- stats::shapiro.test(completeDataFrame$Score[completeDataFrame$Classification == "target"])
				shapiroWilkTarget <- (res[[2]])
			} else {
				shapiroWilkTarget <- NA
				cat("WARNING: Feature '", currentFeature, "' has less than 3 Subjects in the target set. Cannot carry out Shapiro-Wilk test.\n")
			}
		}
		# Versus
		nV <- length(completeDataFrame$Score[completeDataFrame$Classification == "versus"])
		xV <- mean(completeDataFrame$Score[completeDataFrame$Classification == "versus"])
		sV <- stats::sd(completeDataFrame$Score[completeDataFrame$Classification == "versus"])
		dxV <- sV/sqrt(nV)
		dsV <- sV/sqrt(2*(nV-1))
		if (!classifierFlag) {
			if (length(completeDataFrame$Score[completeDataFrame$Classification == "versus"]) > 2) {
				res <- stats::shapiro.test(completeDataFrame$Score[completeDataFrame$Classification == "versus"])
				shapiroWilkVersus <- (res[[2]])
			} else {
				shapiroWilkVersus <- NA
				cat("WARNING: Feature '", currentFeature, "' has less than 3 Subjects in the versus set. Cannot carry out Shapiro-Wilk test.\n")
			}
		}
		# Target and Versus
		nTV <- length(completeDataFrame$Score)
		xTV <- mean(completeDataFrame$Score)
		sTV <- stats::sd(completeDataFrame$Score)
		dxTV <- sTV/sqrt(nTV)
		dsTV <- sTV/sqrt(2*(nTV-1))
		if (!classifierFlag) {
			if (length(completeDataFrame$Score) > 2) {
				res <- stats::shapiro.test(completeDataFrame$Score)
				shapiroWilk <- (res[[2]])
			} else {
				shapiroWilkVersus <- NA
				cat("WARNING: Feature '", currentFeature, "' has less than 3 Subjects in the dataset. Cannot carry out Shapiro-Wilk test.\n")
			}
		}
		localPrecision <- 1
		local_muAndS_precision <- localPrecision

		## Processing: 3. Student's t Test between target and versus Scores

		tTestRes <- stats::t.test(completeDataFrame$Score[completeDataFrame$Classification == "target"], completeDataFrame$Score[completeDataFrame$Classification == "versus"], var.equal = FALSE)

		## Processing: 4. d'

		d <- xT - xV
		dd <- sqrt((sT*sT + sV*sV)/2.)
		dee <- d/dd
		dDee <- sqrt(((dxT/dd)**2) + ((dxV/dd)**2) + ((dee/dd/dd/2)**2)*(sT*sT*dsT*dsT + sV*sV*dsV*dsV))

		## Processing: 5. In the case of Classification, compute thresholds

		if (classifierFlag) {

			## Auxiliary function: computes chi, accuracy and the respective errors
			compute_chi_dChi <- function(xT, xV, sT, sV, dxT, dxV, dsT, dsV, ratio) {

				options(warn=-1)
				epsilon <- 1.e-6
				chiPc <- compute_chi_dChi_aux(xT, xV, sT, sV, ratio)

				dchiPc_dxt <- (compute_chi_dChi_aux(xT + epsilon/2., xV, sT, sV, ratio) - compute_chi_dChi_aux(xT - epsilon/2., xV, sT, sV, ratio))/epsilon
				dchiPc_dxv <- (compute_chi_dChi_aux(xT, xV + epsilon/2., sT, sV, ratio) - compute_chi_dChi_aux(xT, xV - epsilon/2., sT, sV, ratio))/epsilon
				dchiPc_dst <- (compute_chi_dChi_aux(xT, xV, sT + epsilon/2., sV, ratio) - compute_chi_dChi_aux(xT, xV, sT - epsilon/2., sV, ratio))/epsilon
				dchiPc_dsv <- (compute_chi_dChi_aux(xT, xV, sT, sV + epsilon/2., ratio) - compute_chi_dChi_aux(xT, xV, sT, sV - epsilon/2., ratio))/epsilon

				dchiPc <- dchiPc_dxt*dchiPc_dxt*dxT*dxT + dchiPc_dxv*dchiPc_dxv*dxV*dxV + dchiPc_dst*dchiPc_dst*dsT*dsT + dchiPc_dsv*dchiPc_dsv*dsV*dsV
				dchiPc <- sqrt(dchiPc)
				options(warn=0)

				return(c(chiPc, dchiPc))
			}

			## Auxiliary function: computes treshold value chi and accuracy (called by compute_chi_dChi)
			compute_chi_dChi_aux <- function(xT, xV, sT, sV, ratio) {

				alpha <- 1/sT/sT - 1/sV/sV
				beta <- xT/sT/sT - xV/sV/sV
				gamma <- xT*xT/sT/sT - xV*xV/sV/sV - 2*log(sV/sT) + 2*log(ratio)

				chi <- as.numeric(polyroot(c(gamma, -2*beta, alpha)))
				pc <- (ratio*stats::pnorm((xT-chi)/sT) + stats::pnorm((chi-xV)/sV))/(ratio + 1)
				return(c(chi,pc))
			}

			# pv/pt = 1
			chi_dChi <- compute_chi_dChi(xT, xV, sT, sV, dxT, dxV, dsT, dsV, 1)
			chi <- chi_dChi[1:2]
			pc <- chi_dChi[3:4]
			dchi <- chi_dChi[5:6]
			dpc <- chi_dChi[7:8]
			dpc <- dpc[which.min(((chi-(xT + xV)/2)**2))]
			pc <- pc[which.min(((chi-(xT + xV)/2)**2))]
			dchi <- dchi[which.min(((chi-(xT + xV)/2)**2))]
			chi <- chi[which.min(((chi-(xT + xV)/2)**2))]

			# pv/pt = likelyhoodRatio: 90% Target threshold
			chi_dChi <- compute_chi_dChi(xT, xV, sT, sV, dxT, dxV, dsT, dsV, 9.)
			chiUp <- chi_dChi[1:2]
			dchiUp <- chi_dChi[5:6]
			dchiUp <- dchiUp[which.min(((chi-(xT + xV)/2)**2))]
			chiUp <- chiUp[which.min(((chi-(xT + xV)/2)**2))]

			# pv/pt = likelyhoodRatio: 90% Versus threshold
			chi_dChi <- compute_chi_dChi(xT, xV, sT, sV, dxT, dxV, dsT, dsV, 1./9.)
			chiDown <- chi_dChi[1:2]
			dchiDown <- chi_dChi[5:6]
			dchiDown <- dchiDown[which.min(((chi-(xT + xV)/2)**2))]
			chiDown <- chiDown[which.min(((chi-(xT + xV)/2)**2))]

			# ROC and corresponding area under curve
			pred <- completeDataFrame$Score
			obs <- completeDataFrame$Classification
			pROC_data <- suppressMessages(pROC::roc(obs, pred))
			ciAuc <- suppressMessages(pROC::ci.auc(pROC_data))

			outputThresholdFrame <- data.frame(chi, dchi, dee, pc, chiUp, dchiUp, chiDown, dchiDown, ciAuc[2], ciAuc[1], ciAuc[3])
			names(outputThresholdFrame) <- c("Threshold", "DeltaThreshold", "DPrime", "Pc", "ChiUp", "DChiUp", "ChiDown", "DChiDown", "AUC", "AUCDown", "AUCUp")
		}

		## Processing: 5b. In the case of feature analysis (not Classification), prepare output

		completeDataFrame <- subset(completeDataFrame, select = c("Subject", "Classification", "Score", listOfFeature))

		if (!classifierFlag) {
			if (normalizerFlag) {
				rowTarget <- c(paste("Δ", currentFeature, sep=""), "target", nT, round(xT,local_muAndS_precision), round(sT,local_muAndS_precision), round(shapiroWilkTarget,2), signif(tTestRes[[3]],2))
				rowVersus <- c(paste("Δ", currentFeature, sep=""), "versus", nV, round(xV,local_muAndS_precision), round(sV,local_muAndS_precision), round(shapiroWilkVersus,2), signif(tTestRes[[3]],2))
			} else {
				rowTarget <- c(currentFeature, "target", nT, round(xT,local_muAndS_precision), round(sT,local_muAndS_precision), round(shapiroWilkTarget,2), signif(tTestRes[[3]],2))
				rowVersus <- c(currentFeature, "versus", nV, round(xV,local_muAndS_precision), round(sV,local_muAndS_precision), round(shapiroWilkVersus,2), signif(tTestRes[[3]],2))
			}

			if (!exists("outputStatisticsFrame")) {
				outputStatisticsFrame <- rowTarget
				outputStatisticsFrame <- rbind(outputStatisticsFrame, rowVersus)
			} else {
				outputStatisticsFrame <- rbind(outputStatisticsFrame, rowTarget)
				outputStatisticsFrame <- rbind(outputStatisticsFrame, rowVersus)
			}
		}

		## Output: filename building

		if (exists("classifierLabel")) {
			rm(classifierLabel)
		}
		for (i in 1:length(coeffList)) {
			if(!exists("classifierLabel"))
				classifierLabel <- paste(coeffList[i], listOfFeature[i], sep="_")
			else
				classifierLabel <- paste(classifierLabel, coeffList[i], listOfFeature[i], sep="_")
		}

		targetLabel <- paste(listOfTargets, sep="", collapse="_")
		versusLabel <- paste(listOfVersus, sep="", collapse="_")
		targetVSVersusLabel <- paste(targetLabel, "vs", versusLabel, sep="_", collapse="")

		## Output: write to file (classifier case)

		if (classifierFlag) {
			if (outputFileBasename == "") {
				thresholdFileName <- paste(targetVSVersusLabel, "u", classifierLabel, sep="_")
				thresholdFileName <- paste(thresholdFileName, "_thresholds", ".txt", sep="")
				scoresFileName <- paste(thresholdFileName, "_scores", ".dat", sep="")
			} else {
 				thresholdFileName <- paste(outputFileBasename, ".txt", sep="")
				scoresFileName <- paste(outputFileBasename, ".dat", sep="")
 			}

			if (file.exists(thresholdFileName) & file.access(thresholdFileName, mode=2)) {
				cat("ERROR:\tcannot write ", thresholdFileName, ". Check write permission.\n")
				return(NULL)
			}

			if (file.exists(scoresFileName) & file.access(scoresFileName, mode=2)) {
				cat("ERROR:\tcannot write ", scoresFileName, ". Check write permission.\n")
				return(NULL)
			}

			cat("#classifier: ", file=thresholdFileName)
			for (i in 1:length(coeffList)) {
				cat(paste(coeffList[i], "*", listOfFeature[i], sep=""), " ", file=thresholdFileName, append = TRUE)
			}
			cat("\n", file=thresholdFileName, append = TRUE)

			options(warn=-1)
			utils::write.table(format(outputThresholdFrame, drop0trailing=FALSE), file=thresholdFileName, append=TRUE, sep=sep, row.names=FALSE, col.names=TRUE, quote=FALSE)
			utils::write.table(format(subset(completeDataFrame, select=c("Subject", "Classification", "Score")), drop0trailing=FALSE), file=scoresFileName, append=TRUE, sep=sep, row.names=FALSE, col.names=TRUE, quote=FALSE)
			options(warn=0)

			cat("LOG:\tThresholds data frame written to ", thresholdFileName, " successfully.\n", sep="")
			cat("LOG:\tScores data frame written to ", scoresFileName, " successfully.\n", sep="")
		}

		## Output: plot (all cases)

		if (outputFileBasename == "") {
			plotFileName <- paste(targetVSVersusLabel, "u", classifierLabel, sep="_")
		} else {
			if (classifierFlag) {
				plotFileName <- outputFileBasename
			} else {
				plotFileName <- paste(outputFileBasename, currentFeature, sep="_")
			}
 		}

		if (classifierFlag) {
			histPlot <- ebbc_plotHistograms(completeDataFrame, outputThresholdFrame, plotFileName, plotFormat=plotFormat)
			cat("LOG:\tHistogram plot saved to ", plotFileName, "_histogram.", plotFormat, " successfully.\n", sep="")
			thresholdPlot <- ebbc_plotThresholds(completeDataFrame, outputThresholdFrame, plotFileName, plotFormat=plotFormat)
			cat("LOG:\tScore classification plot saved to ", plotFileName, "_score.", plotFormat, " successfully.\n", sep="")
			rocPlot <- ebbc_plotROC(completeDataFrame, plotFileName, plotFormat=plotFormat)
			cat("LOG:\tROC plot saved to ", plotFileName, "_ROC.", plotFormat, " successfully.\n", sep="")
		} else {
			histPlot <- ebbc_plotHistograms(completeDataFrame, outputFileLabel=plotFileName, plotFormat=plotFormat)
			cat("LOG:\tHistogram plot saved to ", plotFileName, "_histogram.", plotFormat, " successfully.\n", sep="")
		}

		if (classifierFlag)
			break
	}

	if ((normalizerFlag == 1) && (classifierFlag == 0)) {
		normalizer <- inputFeaturesList
		listOfFeature <- setdiff(availableFeatures, normalizer)
		normalizerFlag <- 1
	} else if ((normalizerFlag == 0) && (classifierFlag == 0)) {
		listOfFeature <- unique(inputDataset$Feature)
	} else if ((normalizerFlag == 0) && (classifierFlag == 1)) {
		listOfFeature <- inputFeaturesList
	}

	## Output: write to file (analysis case)

	if (!classifierFlag) {
		outputStatisticsFrame <- data.frame(outputStatisticsFrame, row.names=NULL)
		names(outputStatisticsFrame) <- c("Feature", "Classification", "NumberOfSubjects", "Mean", "StdDev", "NormalityTest", "t-test")

		if (outputFileBasename == "") {
			if (normalizerFlag) {
				statisticsFileName <- paste(targetVSVersusLabel, "norm", normalizer, "statistics", sep="_")
			} else {
				statisticsFileName <- paste(targetVSVersusLabel, "statistics", sep="_")
			}
			statisticsFileName <- paste(statisticsFileName, ".txt", sep="")
		} else {
 			statisticsFileName <- paste(outputFileBasename, ".txt", sep="")
 		}

		if (file.exists(statisticsFileName) & file.access(statisticsFileName, mode=2)) {
			cat("ERROR: cannot write ", statisticsFileName, ". Check write permission.\n")
			return(NULL)
		}
		options(warn=-1)
		utils::write.table(format(outputStatisticsFrame, drop0trailing=FALSE), file=statisticsFileName, sep=sep, row.names=FALSE, col.names=TRUE, quote=FALSE)
		options(warn=0)

		## Processing: Correlation coefficients

		targetLabel <- paste(listOfTargets, sep="", collapse="_")
		versusLabel <- paste(listOfVersus, sep="", collapse="_")
		targetVSVersusLabel <- paste(targetLabel, "vs", versusLabel, sep="_", collapse="")

		localPrecision <- 2
		if (normalizerFlag) {
			matrixLabels <- paste("\u0394", listOfFeature, sep='')
		} else {
			matrixLabels <- listOfFeature
		}

		cat("\n", append=TRUE, file=statisticsFileName)
		for (classificationLabel in c("target", "versus")) {
			cat("###############################\n\n", append=TRUE, file=statisticsFileName)
			if (classificationLabel=="target")
				cat("## Evaluation on subjects whose classification belongs to the Target set\n", append=TRUE, file=statisticsFileName)
			else
				cat("## Evaluation on subjects whose classification belongs to the Versus set\n", append=TRUE, file=statisticsFileName)
			cat("# Correlation coefficients:\n", append=TRUE, file=statisticsFileName)
			cat("\tr", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
			cat("\n", append=TRUE, file=statisticsFileName)
			for (a in listOfFeature) {
				cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
				subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, ]
				if (classificationLabel == "target") {
					subjectsA <- subjectsA[subjectsA$Class %in% listOfTargets, "Subject"]
				} else if (classificationLabel == "versus") {
					subjectsA <- subjectsA[subjectsA$Class %in% listOfVersus, "Subject"]
				}

				for (b in listOfFeature) {

					if (which(listOfFeature == a) < which(listOfFeature == b)) {
						subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
						if (normalizerFlag) {
							subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
							compliantSubjects <- intersect(subjectsA, subjectsB)
							compliantSubjects <- intersect(compliantSubjects, subjectsN)
							filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
							valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
							valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
						} else {
							compliantSubjects <- intersect(subjectsA, subjectsB)
							filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
							valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
							valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
						}
						if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
							cat(sep=sep, "", round(stats::cor(valuesA, valuesB), localPrecision), append=TRUE, file=statisticsFileName)
						} else {
							cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
							cat("WARNING: Correlation between '", a, "', '", b, "' cannot be computed for the ", classificationLabel, " set: less than 3 values.\n", sep="")
						}
					} else {
						cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
					}
				}
				cat("\n", append=TRUE, file=statisticsFileName)
			}
			cat("\n", append=TRUE, file=statisticsFileName)

			cat(sep="", "# Correlation p-values \"", classificationLabel, "\":\n", append=TRUE, file=statisticsFileName)
			cat("\tp", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
			cat("\n", append=TRUE, file=statisticsFileName)
			for (a in listOfFeature) {
				cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
				subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, ]
				if (classificationLabel == "target") {
					subjectsA <- subjectsA[subjectsA$Class %in% listOfTargets, "Subject"]
				} else if (classificationLabel == "versus") {
					subjectsA <- subjectsA[subjectsA$Class %in% listOfVersus, "Subject"]
				}

				for (b in listOfFeature) {

					if (which(listOfFeature == a) < which(listOfFeature == b)) {
						subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
						if (normalizerFlag) {
							subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
							compliantSubjects <- intersect(subjectsA, subjectsB)
							compliantSubjects <- intersect(compliantSubjects, subjectsN)
							if (length(compliantSubjects) == 0) next
							filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]

							valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
							valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
						} else {
							compliantSubjects <- intersect(subjectsA, subjectsB)
							if (length(compliantSubjects) == 0) next
							filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]

							valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
							valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
						}
						if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
							ctst <- stats::cor.test(valuesA, valuesB)
							cat(sep=sep, "", signif(ctst[[3]], localPrecision), append=TRUE, file=statisticsFileName)
						} else {
							cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
							cat("WARNING: Correlation (p-value) between '", a, "', '", b, "' cannot be computed for the ", classificationLabel, " set: less than 3 values.\n", sep="")
						}
					} else {
						cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
					}
				}
				cat("\n", append=TRUE, file=statisticsFileName)
			}
			cat("\n", append=TRUE, file=statisticsFileName)
		}

		cat("###############################\n\n", append=TRUE, file=statisticsFileName)
		cat("##  Evaluation on subjects whose classification belongs to the union of Target and Versus sets\n", append=TRUE, file=statisticsFileName)
		cat("# Correlation coefficients:\n", append=TRUE, file=statisticsFileName)
		cat("\tr", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
		cat("\n", append=TRUE, file=statisticsFileName)
		for (a in listOfFeature) {
			cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
			subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, "Subject"]
			for (b in listOfFeature) {
				if (which(listOfFeature == a) < which(listOfFeature == b)) {
					subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
					if (normalizerFlag) {
						subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
						compliantSubjects <- intersect(subjectsA, subjectsB)
						compliantSubjects <- intersect(compliantSubjects, subjectsN)
						filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]

						valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
						valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]

					} else {
						compliantSubjects <- intersect(subjectsA, subjectsB)
						filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
						valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
						valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
					}
					if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
						cat(sep=sep, "", round(stats::cor(valuesA, valuesB), localPrecision), append=TRUE, file=statisticsFileName)
					} else {
						cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
						cat("WARNING: Correlation between '", a, "', '", b, "' cannot be computed: less than 3 values.\n", sep="")
					}
				} else {
					cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
				}
			}
			cat("\n", append=TRUE, file=statisticsFileName)
		}
		cat("\n", append=TRUE, file=statisticsFileName)

		cat("# Correlation p-values:\n", append=TRUE, file=statisticsFileName)
		cat("\tp", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
		cat("\n", append=TRUE, file=statisticsFileName)
		for (a in listOfFeature) {
			cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
			subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, "Subject"]
			for (b in listOfFeature) {

				if (which(listOfFeature == a) < which(listOfFeature == b)) {
					subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
					if (normalizerFlag) {
						subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
						compliantSubjects <- intersect(subjectsA, subjectsB)
						compliantSubjects <- intersect(compliantSubjects, subjectsN)
						if (length(compliantSubjects) == 0) next
						filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]

						valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
						valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
					} else {
						compliantSubjects <- intersect(subjectsA, subjectsB)
						if (length(compliantSubjects) == 0) next
						filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]

						valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
						valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
					}
					if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
						ctst <- stats::cor.test(valuesA, valuesB)
						cat(sep=sep, "", signif(ctst[[3]], localPrecision), append=TRUE, file=statisticsFileName)
					} else {
						cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
						cat("WARNING: Correlation (p-value) between '", a, "', '", b, "' cannot be computed: less than 3 values.\n", sep="")
					}
				} else
					cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
			}
			cat("\n", append=TRUE, file=statisticsFileName)
		}
		cat("\n", append=TRUE, file=statisticsFileName)

		cat("# Epsilon:\n", append=TRUE, file=statisticsFileName)
		cat("\teps", paste(matrixLabels, sep=sep), sep=sep, append=TRUE, file=statisticsFileName)
		cat("\n", append=TRUE, file=statisticsFileName)
		for (a in listOfFeature) {
			cat(paste(sep="", "\t", matrixLabels[which(listOfFeature == a)]), append=TRUE, file=statisticsFileName)
			subjectsA <- targetVersusFrame[targetVersusFrame$Feature == a, "Subject"]
			for (b in listOfFeature) {

				if (a != b) {
					subjectsB <- targetVersusFrame[targetVersusFrame$Feature == b, "Subject"]
					if (normalizerFlag) {
						subjectsN <- targetVersusFrame[targetVersusFrame$Feature == normalizer, "Subject"]
						compliantSubjects <- intersect(subjectsA, subjectsB)
						compliantSubjects <- intersect(compliantSubjects, subjectsN)
						filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
						valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
						valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"] - filteredFrame[filteredFrame$Feature == normalizer, "Mean"]
					} else {
						compliantSubjects <- intersect(subjectsA, subjectsB)
						filteredFrame <- targetVersusFrame[targetVersusFrame$Subject %in% compliantSubjects,]
						valuesA <- filteredFrame[filteredFrame$Feature == a, "Mean"]
						valuesB <- filteredFrame[filteredFrame$Feature == b, "Mean"]
					}
					if ((length(valuesA) == length(valuesB)) && (length(valuesA) > 2)) {
						cat(sep=sep, "", round(-stats::cor(valuesA, valuesB)*stats::sd(valuesA)/stats::sd(valuesB), localPrecision), append=TRUE, file=statisticsFileName)
					} else {
						cat(sep=sep, "", NA, append=TRUE, file=statisticsFileName)
						cat("WARNING: Optimized coefficient between '", a, "', '", b, "' cannot be computed: less than 3 values.\n", sep="")
					}
				} else
					cat(sep=sep, "", "-", append=TRUE, file=statisticsFileName)
			}
			cat("\n", append=TRUE, file=statisticsFileName)
		}
		cat("\n", append=TRUE, file=statisticsFileName)

		cat("LOG:\tFeature statistics and correlation matrices written to ", statisticsFileName, " successfully.\n", sep="")
	}

	if (classifierFlag) {
		return(outputThresholdFrame)
	} else {
		return(outputStatisticsFrame)
	}
}
LeonardoRicci/EBBC documentation built on Jan. 24, 2020, 1:25 a.m.