#' Accuracy calculation of classification method
#' @description This method provides several classification evaluation metrics
#' to assess the accuracy of predicted pathway classification.
#' The accuracy calculation is performed using predicted pathway
#' activity labels from the employed classification method for each sample and
#' the corresponding true activity labels for the given pathway. A confusion
#' matrix is created to display the classification accuracy decomposed into the
#' distinct pathway activity classes in tabular form for the user. Additional
#' classification evaluation statistics (such as sensitivity, specificity,
#' recall, percentage of classified samples etc) is the optional feature that
#' the user can specify.
#' @author Ozlem Karadeniz \email{ozlem.karadeniz.283@@cranfield.ac.uk}
#' @param true_labels a data frame, matrix or file name which contains a
#' column named "sample" that consists of sample names / IDs and another column
#' named after a specific pathway which contains the corresponding true pathway
#' activity labels.
#' @param predicted_labels a predicted labels data frame or matrix
#' generated by the classification method yielding predicted pathway activity
#' labels in a column called "class" for the samples in the "sample" column.
#' @param pathway name of pathway used for classification (Note: this pathway
#' name must be present in the true labels data frame / matrix / file name for
#' classification evaluation and generation of the confusion matrix.)
#' @param show_stats an optional flag to display additional statistical
#' information using the confusion matrix and other classification evaluation
#' metrics including: sensitivity, specificity, precision, false positive rate,
#' false negative rate etc.
#' @return confusion_matrix
#' @importFrom utils read.table
#' @export
#'
#' @examples
#' \dontrun{calculate_accuracy(true_labels_df, predicted_labels_df, "ER",
#' show_stats= TRUE)}
calculate_accuracy <-function(true_labels, predicted_labels, pathway,
show_stats=FALSE){
# if type of true_labels parameter is character,then it should be file name.
# file is checked if it exists and valid
if(is.character(true_labels)){
if(!file.exists(true_labels)){
stop(paste("Error in calculate_accuracy: " , true_labels , " file does not exist!"))
}
else{
true_labels_df <- tryCatch(read.table(true_labels, header=TRUE,sep = "\t"),
error=function(e){ stop(paste("File read error in calculate_accuracy: filename = ", true_labels , "!")) }
)
}
}
# true_labels is checked if it is data frame or matrix
# error is thrown if true_labels parameter is not compatible
else if (is.matrix(true_labels)){
true_labels_df <- as.data.frame(true_labels)
}
else if (is.data.frame(true_labels)){
true_labels_df <- true_labels
}
# otherwise error message is thrown
else{
stop("Error in calculate_accuracy: true_labels argument should be of type dataframe/matrix/file name!")
}
# predicted_labels is checked if it is data frame or matrix
# error is thrown if predicted_labels parameter is not compatible
if (is.matrix(predicted_labels)){
predicted_labels_df <- as.data.frame(predicted_labels)
}
else if (!is.data.frame(predicted_labels)){
stop("Error in calculate_accuracy: predicted_labels argument should be of type dataframe/matrix!")
}
else{
predicted_labels_df <- predicted_labels
}
# Error is thrown if the pathway parameter provided is not in the true labels
# data frame
if (!(pathway %in% colnames(true_labels_df))) {
stop(sprintf("%s pathway labels are not present in true_labels.", pathway))
} else if (!("sample" %in% colnames(true_labels_df))) {
stop('Cannot find a column called "sample" in true_labels.')
}
# Only labels for the label is kept in true_labels_df, the rest is removed
true_labels_df <- true_labels_df[, c(which(colnames(true_labels_df)=="sample"), which(colnames(true_labels_df)==pathway))]
# Labels Equivocal, [Not Evaluated] and Indeterminate are all set to uncertain
true_labels_df[,2] <- sapply(true_labels_df[,2],
function(x){gsub(pattern = "(Equivocal|\\[Not Evaluated\\]|Indeterminate)",
replacement = "Uncertain", x)})
# label name is updated to label, in order to have the same for both HER AND ER pathways
colnames(true_labels_df)[2] <- "label"
true_labels_df$label= tolower(true_labels_df$label)
# predicted_labels_df.sample and true_labels_df.CaseID are set to the same format
predicted_labels_df[,1] <- gsub("\\.", "-", predicted_labels_df[,1])
# df dataframe created which holds sample name(caseID), actual label(label) and predictedl label(class)
df <- merge(x=true_labels_df, y=predicted_labels_df, by="sample")
# statistical data is calculated below to create confusion matrix
# True Positive, predicted positive, actual positive
TP <- nrow(df[(df$class == "Active" & df$label == "positive"),])
# False Positive, predicted positive, actual negative
FP <- nrow(df[(df$class == "Active" & df$label == "negative"),])
# True Negative, predicted negative, actual negative
TN <- nrow(df[(df$class == "Inactive" & df$label == "negative"),])
# False Negative, predicted negative, actual positive
FN <- nrow(df[(df$class == "Inactive" & df$label == "positive"),])
# predicted uncertain
prdedicted_uncertain = nrow(df[df$class == "Uncertain",])
# create confusion matrix
matrix_data <- c(TP, FN, FP, TN)
confusion_matrix<-matrix(matrix_data,nrow = 2,ncol = 2,
dimnames = list(c("Prediction Positive ","Prediction Negative"),
c("Actual Positive","Actual Negative")))
classified_samples_proportion <- (TP + FN + FP + TN) / sum(TP + FN + FP + TN + prdedicted_uncertain ) * 100
accuracy_amongst_classified_samples <- (TP + TN) / (TP + FN + FP + TN) * 100
cat(paste("Confusion Matrix for ", pathway, " pathway\n"))
cat("--------------------------------------------------------------\n")
print(confusion_matrix)
cat("--------------------------------------------------------------\n")
if(show_stats == 'TRUE'){
cat("Statistics in Confusion Matrix\n")
cat("--------------------------------------------------------------\n")
cat(paste0("Proportion of classified samples: ",
format(round(classified_samples_proportion, 2), nsmall = 2), "%\n"))
cat(paste0("Accuracy amongst classified samples: " ,
format(round(accuracy_amongst_classified_samples, 2), nsmall = 2), "%\n"))
cat(paste0("True Positive(TP): " , TP, "\n"))
cat(paste0("True Negative(TN): " , TN, "\n"))
cat(paste0("False Negative(FN): " , FN, "\n"))
cat(paste0("False Positive(FP): " , FP, "\n"))
cat("--------------------------------------------------------------\n")
cat(paste0("True Positive Rate(TPR)(sensitivity)(Recall): ",
format(round(TP / (TP + FN) * 100 , 2) , nsmall =2), "%\n"))
cat(paste0("True Negative Rate(TNR)(specificity): ",
format(round(TN / (TN + FP) * 100 , 2) , nsmall =2), "%\n"))
cat(paste0("Precision (Positive predictive value): ",
format(round(TP / (TP + FP) * 100 , 2) , nsmall =2), "%\n"))
cat(paste0("False Positive Rate(FPR): ",
format(round(FP / (FP + TN) * 100 , 2) , nsmall =2), "%\n"))
cat(paste0("False Negative Rate(FNR): ",
format(round(FN / (FN + TP) * 100 , 2) , nsmall =2), "%\n"))
}
return(confusion_matrix)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.