R/calculate_accuracy.R

Defines functions calculate_accuracy

Documented in calculate_accuracy

#' Accuracy calculation of classification method
#' @description This method provides several classification evaluation metrics
#' to assess the accuracy of predicted pathway classification.
#' The accuracy calculation is performed using predicted pathway
#' activity labels from the employed classification method for each sample and
#' the corresponding true activity labels for the given pathway. A confusion
#' matrix is created to display the classification accuracy decomposed into the
#' distinct pathway activity classes in tabular form for the user. Additional
#' classification evaluation statistics (such as sensitivity, specificity,
#' recall, percentage of classified samples etc) is the optional feature that
#' the user can specify.
#' @author Ozlem Karadeniz \email{ozlem.karadeniz.283@@cranfield.ac.uk}
#' @param true_labels  a data frame, matrix or file name which contains a
#' column named "sample" that consists of sample names / IDs and another column
#' named after a specific pathway which contains the corresponding true pathway
#' activity labels.
#' @param predicted_labels a predicted labels data frame or matrix
#' generated by the classification method yielding predicted pathway activity
#' labels in a column called "class" for the samples in the "sample" column.
#' @param pathway name of pathway used for classification (Note: this pathway
#' name must be present in the true labels data frame / matrix / file name for
#' classification evaluation and generation of the confusion matrix.)
#' @param show_stats an optional flag to display additional statistical
#' information using the confusion matrix and other classification evaluation
#' metrics including: sensitivity, specificity, precision, false positive rate,
#' false negative rate etc.
#' @return confusion_matrix
#' @importFrom utils read.table
#' @export
#'
#' @examples
#' \dontrun{calculate_accuracy(true_labels_df, predicted_labels_df, "ER",
#' show_stats= TRUE)}

calculate_accuracy <-function(true_labels, predicted_labels, pathway,
                              show_stats=FALSE){
  # if type of true_labels parameter is character,then it should be file name.
  # file is checked if it exists and valid
  if(is.character(true_labels)){
    if(!file.exists(true_labels)){
      stop(paste("Error in calculate_accuracy: " , true_labels , " file does not exist!"))
    }
    else{
      true_labels_df <- tryCatch(read.table(true_labels, header=TRUE,sep = "\t"),
                                 error=function(e){ stop(paste("File read error in calculate_accuracy: filename = ", true_labels , "!")) }
      )
    }
  }
  # true_labels is checked if it is data frame or matrix
  # error is thrown if true_labels parameter is not compatible
  else if (is.matrix(true_labels)){
    true_labels_df <- as.data.frame(true_labels)
  }
  else if (is.data.frame(true_labels)){
    true_labels_df <- true_labels
  }
  # otherwise error message is thrown
  else{
    stop("Error in calculate_accuracy: true_labels argument should be of type dataframe/matrix/file name!")
  }

  # predicted_labels is checked if it is data frame or matrix
  # error is thrown if predicted_labels parameter is not compatible
  if (is.matrix(predicted_labels)){
    predicted_labels_df <- as.data.frame(predicted_labels)
  }

  else if (!is.data.frame(predicted_labels)){
    stop("Error in calculate_accuracy: predicted_labels argument should be of type dataframe/matrix!")
  }
  else{
    predicted_labels_df <- predicted_labels
  }

  # Error is thrown if the pathway parameter provided is not in the true labels
  # data frame
  if (!(pathway %in% colnames(true_labels_df))) {
    stop(sprintf("%s pathway labels are not present in true_labels.", pathway))
  } else if (!("sample" %in% colnames(true_labels_df))) {
    stop('Cannot find a column called "sample" in true_labels.')
  }

  # Only labels for the label is kept in  true_labels_df, the rest is removed
  true_labels_df <- true_labels_df[, c(which(colnames(true_labels_df)=="sample"), which(colnames(true_labels_df)==pathway))]

  # Labels Equivocal, [Not Evaluated] and Indeterminate are all set to uncertain
  true_labels_df[,2] <- sapply(true_labels_df[,2],
                               function(x){gsub(pattern = "(Equivocal|\\[Not Evaluated\\]|Indeterminate)",
                                                replacement = "Uncertain", x)})

  # label name is updated to label, in order to have the same for both HER AND ER pathways
  colnames(true_labels_df)[2] <- "label"

  true_labels_df$label= tolower(true_labels_df$label)

  # predicted_labels_df.sample and true_labels_df.CaseID are set to the same format
  predicted_labels_df[,1] <- gsub("\\.", "-", predicted_labels_df[,1])

  # df dataframe created which holds sample name(caseID), actual label(label) and predictedl label(class)
  df <- merge(x=true_labels_df, y=predicted_labels_df, by="sample")

  # statistical data is calculated below to create confusion matrix
  # True Positive, predicted positive, actual positive
  TP <- nrow(df[(df$class == "Active" & df$label == "positive"),])
  # False Positive, predicted positive, actual negative
  FP <- nrow(df[(df$class == "Active" & df$label == "negative"),])
  # True Negative, predicted negative, actual negative
  TN <- nrow(df[(df$class == "Inactive" & df$label == "negative"),])
  # False Negative, predicted negative, actual positive
  FN <- nrow(df[(df$class == "Inactive" & df$label == "positive"),])
  # predicted uncertain
  prdedicted_uncertain = nrow(df[df$class == "Uncertain",])

  # create confusion matrix
  matrix_data <- c(TP, FN, FP, TN)
  confusion_matrix<-matrix(matrix_data,nrow = 2,ncol = 2,
                           dimnames = list(c("Prediction Positive ","Prediction Negative"),
                                           c("Actual Positive","Actual Negative")))

  classified_samples_proportion <- (TP + FN + FP + TN) / sum(TP + FN + FP + TN + prdedicted_uncertain ) * 100
  accuracy_amongst_classified_samples <- (TP + TN) / (TP + FN + FP + TN) * 100

  cat(paste("Confusion Matrix for ", pathway, " pathway\n"))
  cat("--------------------------------------------------------------\n")
  print(confusion_matrix)
  cat("--------------------------------------------------------------\n")

  if(show_stats == 'TRUE'){

    cat("Statistics in Confusion Matrix\n")
    cat("--------------------------------------------------------------\n")
    cat(paste0("Proportion of classified samples: ",
               format(round(classified_samples_proportion, 2), nsmall = 2), "%\n"))
    cat(paste0("Accuracy amongst classified samples: " ,
               format(round(accuracy_amongst_classified_samples, 2), nsmall = 2), "%\n"))
    cat(paste0("True Positive(TP): " , TP, "\n"))
    cat(paste0("True Negative(TN): " , TN, "\n"))
    cat(paste0("False Negative(FN): " , FN, "\n"))
    cat(paste0("False Positive(FP): " , FP, "\n"))
    cat("--------------------------------------------------------------\n")
    cat(paste0("True Positive Rate(TPR)(sensitivity)(Recall): ",
                 format(round(TP / (TP + FN) * 100 , 2) , nsmall =2), "%\n"))
    cat(paste0("True Negative Rate(TNR)(specificity): ",
                 format(round(TN / (TN + FP) * 100 , 2) , nsmall =2), "%\n"))
    cat(paste0("Precision (Positive predictive value): ",
                 format(round(TP / (TP + FP) * 100 , 2) , nsmall =2), "%\n"))
    cat(paste0("False Positive Rate(FPR): ",
                 format(round(FP / (FP + TN) * 100 ,  2) , nsmall =2), "%\n"))
    cat(paste0("False Negative Rate(FNR): ",
                 format(round(FN / (FN + TP) * 100 ,  2) , nsmall =2), "%\n"))
  }

  return(confusion_matrix)
}
a-thind/PathAnalyser documentation built on May 6, 2022, 9:50 a.m.