FPML: Footprint Machine Learning

#----------------------------------------------------------------------------------------------------
#' Compute Matthews Correlation Coefficient
#'
#' Given the true classes and predicted classes, compute Matthews Correlation Coefficient
#'
#' @param true_classes A binary vector of classes that indicate the true classification of each
#' point
#' @param predicted_classes A binary vector of classes that indicate the predicted classification
#' of each point
#'
#' @return The Matthews Correlation Coefficient for the given prediction/truth sets
#'
#' @export

mattcc <- function(true_classes, predicted_classes) {
        
    pos <- true_classes == 1
    neg <- true_classes == 0
    
    TP <- as.numeric(sum( predicted_classes[pos] == true_classes[pos] ))
    TN <- as.numeric(sum( predicted_classes[neg] == true_classes[neg] ))
    FP <- as.numeric(sum( predicted_classes[pos] != true_classes[pos] ))
    FN <- as.numeric(sum( predicted_classes[neg] != true_classes[neg] )) 
    
    N1 <- TP*TN
    N2 <- FP*FN
    D1 <- TP+FP
    D2 <- TP+FN
    D3 <- TN+FP
    D4 <- TN+FN
    
    N <- N1-N2
    D <- D1*D2*D3*D4
    
    if (D==0) {
        M <- 0
    } else {
        M <- N/sqrt(D)
    }
    
    return(M)
} #mattcc
#----------------------------------------------------------------------------------------------------
#' Create Matthews Correlation Coefficient Data for Plotting
#'
#' Using the formula for Matthews Correlation Coefficient and a given number of points ranging
#' from 0 to 100, create data for MCC over a set of thresholds between 0 and 1. These data can then
#' be used to plot MCC curves
#'
#' @param true.values The true classifications for the data
#' @param pred.probs The predicted probabilities for the data
#' @param num.points The number of different thresholds to set between 0 and 1 (default = 101)
#'
#' @return A numeric vector of MCC values over the supplied number of points
#'

mattcc.curve <- function(true.values, pred.probs, num.points=101) {
    threshholds <- seq(from=0, to=1, length.out=num.points)
    thresh.inds <- 1:length(threshholds)

    mattcc.data <- sapply(thresh.inds,
                      function(t) mattcc(true.values,as.numeric(pred.probs>threshholds[t]))
                   )
    return(mattcc.data)

} # mattcc.curve
#----------------------------------------------------------------------------------------------------
#' Create a Data Frame of Predictions for ChIPseq Data Using a Boosted Model
#'
#' Given a boosted model, a set of predictors, and a set of responses, create predicted
#' responses and put them together with the actual responses.
#'
#' @param gbdt.model A gradient boosted model
#' @param X.testdata A matrix of predictors in the testing set
#' @param y.testdata A vector of responses in the testing set
#'
#' @return A data frame containing the model name, the predicted responses, and the actual responses
#' for the test dataset

make.pred.df.from.model <- function(gbdt.model,X.testdata,y.testdata) {
    
    preds.test <- stats::predict(gbdt.model, X.testdata, missing=NA)
    pred.df <- data.frame("ChIPseq.bound"=y.testdata,"Prediction"=preds.test)
    
    pred.df$Model.Name = gbdt.model$Model.Name
    return(pred.df)
    
} #make.pred.df.from.model
#----------------------------------------------------------------------------------------------------
#' Create a Data Frame of Predictions for ChIPseq Data Using a Linear Model
#'
#' Given a linear model and a data frame containing labeled predictors of test data and the actual
#' responses as the "ChIPseq.bound" column, predict the responses and return a data frame of the
#' actual responses, the predicted responses, and the model name
#'
#' @param glm.model A generalized linear model that corresponds to the supplied data
#' @param df.testdata A data frame containing test data. It must include columns for all the
#' predictors, plus the actual response as the column "ChIPseq.bound"
#'
#' @return A data frame containing the actual response, the predicted response, and the name of the
#' supplied model

make.pred.df.from.glm <- function(glm.model,df.testdata) {
    
    preds.test <- stats::predict.glm(object=glm.model, newdata=df.testdata, type="response")
    
    pred.df <- data.frame("ChIPseq.bound"=df.testdata$ChIPseq.bound,"Prediction"=preds.test)
    
    pred.df$Model.Name = glm.model$Model.Name
    return(pred.df)
    
} #make.pred.df.from.glm 
#----------------------------------------------------------------------------------------------------
#' Create the "Stats" Data Frame from Prediction Data Frame
#'
#' Given the "Prediction" data frame generated by a particular model type, compute the statistics
#' needed to plot the ROC curve, and Precision-Recall curve
#'
#' @param pred.df Data frame containing the actual reponses, predicted responses, and model name for
#' a given model. Generally, this should be output by either \code{make.pred.df.from.model} or
#' \code{make.pred.df.from.glm}
#' @param num.roc.points An integer indicating how many points to include in calculating the ROC
#' curve parameters
#'
#' @return A data frame containing statistics needed to plot the ROC and Precision-Recall curves

make.stats.df.from.preds <- function(pred.df, num.roc.points=101) {
    
    preds <- pred.df$Prediction
    cs.vals <- pred.df$ChIPseq.bound

    roc.tf <- pROC::roc(cs.vals, preds,
                        ret=c("threshold", "sens", "spec", "ppv", "npv", "acc"))
    stats.tf <- pROC::coords((roc.tf),
                             seq(from=0, to=1,
                                 length.out=num.roc.points),
                             ret=c("threshold", "sens", "spec", "ppv", "npv", "acc"))
    mattcc.tf <- mattcc.curve(cs.vals, preds, num.points=num.roc.points)
    df.stats <- as.data.frame(t(stats.tf))
    df.stats$MattCC <- mattcc.tf
    df.stats$Model.Name <- pred.df$Model.Name[1]
    
    return(df.stats)
    
} #make.stats.df.from.preds
#----------------------------------------------------------------------------------------------------