#----------------------------------------------------------------------------------------------------
#' Compute Matthews Correlation Coefficient
#'
#' Given the true classes and predicted classes, compute Matthews Correlation Coefficient
#'
#' @param true_classes A binary vector of classes that indicate the true classification of each
#' point
#' @param predicted_classes A binary vector of classes that indicate the predicted classification
#' of each point
#'
#' @return The Matthews Correlation Coefficient for the given prediction/truth sets
#'
#' @export
mattcc <- function(true_classes, predicted_classes) {
pos <- true_classes == 1
neg <- true_classes == 0
TP <- as.numeric(sum( predicted_classes[pos] == true_classes[pos] ))
TN <- as.numeric(sum( predicted_classes[neg] == true_classes[neg] ))
FP <- as.numeric(sum( predicted_classes[pos] != true_classes[pos] ))
FN <- as.numeric(sum( predicted_classes[neg] != true_classes[neg] ))
N1 <- TP*TN
N2 <- FP*FN
D1 <- TP+FP
D2 <- TP+FN
D3 <- TN+FP
D4 <- TN+FN
N <- N1-N2
D <- D1*D2*D3*D4
if (D==0) {
M <- 0
} else {
M <- N/sqrt(D)
}
return(M)
} #mattcc
#----------------------------------------------------------------------------------------------------
#' Create Matthews Correlation Coefficient Data for Plotting
#'
#' Using the formula for Matthews Correlation Coefficient and a given number of points ranging
#' from 0 to 100, create data for MCC over a set of thresholds between 0 and 1. These data can then
#' be used to plot MCC curves
#'
#' @param true.values The true classifications for the data
#' @param pred.probs The predicted probabilities for the data
#' @param num.points The number of different thresholds to set between 0 and 1 (default = 101)
#'
#' @return A numeric vector of MCC values over the supplied number of points
#'
mattcc.curve <- function(true.values, pred.probs, num.points=101) {
threshholds <- seq(from=0, to=1, length.out=num.points)
thresh.inds <- 1:length(threshholds)
mattcc.data <- sapply(thresh.inds,
function(t) mattcc(true.values,as.numeric(pred.probs>threshholds[t]))
)
return(mattcc.data)
} # mattcc.curve
#----------------------------------------------------------------------------------------------------
#' Create a Data Frame of Predictions for ChIPseq Data Using a Boosted Model
#'
#' Given a boosted model, a set of predictors, and a set of responses, create predicted
#' responses and put them together with the actual responses.
#'
#' @param gbdt.model A gradient boosted model
#' @param X.testdata A matrix of predictors in the testing set
#' @param y.testdata A vector of responses in the testing set
#'
#' @return A data frame containing the model name, the predicted responses, and the actual responses
#' for the test dataset
make.pred.df.from.model <- function(gbdt.model,X.testdata,y.testdata) {
preds.test <- stats::predict(gbdt.model, X.testdata, missing=NA)
pred.df <- data.frame("ChIPseq.bound"=y.testdata,"Prediction"=preds.test)
pred.df$Model.Name = gbdt.model$Model.Name
return(pred.df)
} #make.pred.df.from.model
#----------------------------------------------------------------------------------------------------
#' Create a Data Frame of Predictions for ChIPseq Data Using a Linear Model
#'
#' Given a linear model and a data frame containing labeled predictors of test data and the actual
#' responses as the "ChIPseq.bound" column, predict the responses and return a data frame of the
#' actual responses, the predicted responses, and the model name
#'
#' @param glm.model A generalized linear model that corresponds to the supplied data
#' @param df.testdata A data frame containing test data. It must include columns for all the
#' predictors, plus the actual response as the column "ChIPseq.bound"
#'
#' @return A data frame containing the actual response, the predicted response, and the name of the
#' supplied model
make.pred.df.from.glm <- function(glm.model,df.testdata) {
preds.test <- stats::predict.glm(object=glm.model, newdata=df.testdata, type="response")
pred.df <- data.frame("ChIPseq.bound"=df.testdata$ChIPseq.bound,"Prediction"=preds.test)
pred.df$Model.Name = glm.model$Model.Name
return(pred.df)
} #make.pred.df.from.glm
#----------------------------------------------------------------------------------------------------
#' Create the "Stats" Data Frame from Prediction Data Frame
#'
#' Given the "Prediction" data frame generated by a particular model type, compute the statistics
#' needed to plot the ROC curve, and Precision-Recall curve
#'
#' @param pred.df Data frame containing the actual reponses, predicted responses, and model name for
#' a given model. Generally, this should be output by either \code{make.pred.df.from.model} or
#' \code{make.pred.df.from.glm}
#' @param num.roc.points An integer indicating how many points to include in calculating the ROC
#' curve parameters
#'
#' @return A data frame containing statistics needed to plot the ROC and Precision-Recall curves
make.stats.df.from.preds <- function(pred.df, num.roc.points=101) {
preds <- pred.df$Prediction
cs.vals <- pred.df$ChIPseq.bound
roc.tf <- pROC::roc(cs.vals, preds,
ret=c("threshold", "sens", "spec", "ppv", "npv", "acc"))
stats.tf <- pROC::coords((roc.tf),
seq(from=0, to=1,
length.out=num.roc.points),
ret=c("threshold", "sens", "spec", "ppv", "npv", "acc"))
mattcc.tf <- mattcc.curve(cs.vals, preds, num.points=num.roc.points)
df.stats <- as.data.frame(t(stats.tf))
df.stats$MattCC <- mattcc.tf
df.stats$Model.Name <- pred.df$Model.Name[1]
return(df.stats)
} #make.stats.df.from.preds
#----------------------------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.