R/metricView.R

Defines functions metricView

Documented in metricView

#' Internal function called by \code{\link{runDVboostwrapper}}
#'
#' After model training and testing, \code{metricView} generates summary statistics and figures for users to examine the training performance
#' @param DV.fit.res output generated by \code{\link{fitDVboostmodel}}
#' @param input.sample.ID same sample ID used for annotation procedure
#' @param plot.filename full path to the figure file regarding training performance
#' @param DV.sampleQC.filename full path to the text file that contains metrics for model training performance
#' @param subset a vector of integers indicating a subset of the SVs that all metrics will be evaluated on
#' @return \describe{
#' \item{A figure file}{The figure is saved at \code{plot.filename} and contains 6 subpanels, including \enumerate{\item distribution of known(1) versus novel(0) variants; \item feature importance for model training; \item distribution of Q scores for known and novel variants; \item precision, recell characteristics and best F score; \item ROC curve; \item FDR versus Q score}}
#' \item{QC metrics file}{The .txt file is saved at \code{DV.sampleQC.filename} and contains 6 fields, including \itemize{\item sample.ID: input sample ID \item Fscore_PR: F score for precision&recall \item AUC.PR.known.var: Area under the curve for precision and recall \item AUC.FP.TP.known.var:AUROC \item npos:number of known variants used for training \item nneg:number of novel variants used for training }}
#' }
#'
#' @export
#' @import ggplot2
#' @import ROCR
#' @import gridExtra
#'
metricView <- function(DV.fit.res,input.sample.ID, plot.filename, DV.sampleQC.filename, subset) {
  t1 <- as.data.frame(table(DV.fit.res$is.known.variant[subset]))
  gp1 <-  ggplot(data = t1) + geom_col(aes(x=Var1, y=Freq)) +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(x='1 = known variants; 0 = others', y='Frequency', title=input.sample.ID)

  sum.res <- summary.gbm(DV.fit.res, plotit = FALSE, las = 2,
                         main = "Relative influence of DVboost attributes")
  sum.res$var <- factor(sum.res$var,levels = sum.res$var[order(sum.res$rel.inf, decreasing = F)])
  gp2 <- ggplot(data = sum.res) + geom_col(aes(x=var, y=rel.inf , fill=rel.inf)) +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(x=NULL, y='Relative influence', title='Relative influence of DVboost attributes') +
    coord_flip() + guides(fill=FALSE)

  df1 <- data.frame(DVboost.Q.score=DV.fit.res$DVboost.Q.score[subset],
                    is.known.variant=DV.fit.res$is.known.variant[subset])
  gp3 <- ggplot(data=df1) + geom_boxplot(aes(x=as.factor(is.known.variant), y=DVboost.Q.score)) +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(x="1 = known variants; 0 = others", y='DVboost q-score', title = paste("boxplots of DVboost Q-scores"))

  pred2 <- prediction( (DV.fit.res$DVboost.Q.score[subset]) , DV.fit.res$is.known.variant[subset]==1)
  perf3 <- performance(pred2, "prec", "rec")
  # perf3@x.name
  # [1] "Recall"
  # perf3@y.name
  # [1] "Precision"
  recall.vec <- unlist(perf3@x.values)
  precision.vec <- unlist(perf3@y.values)
  est.AUC.val <- sum(diff(recall.vec)*precision.vec[2:length(precision.vec)])
  df1 <- data.frame(x=recall.vec,
                    y=precision.vec)
  fscore <- 2*recall.vec*precision.vec/(recall.vec + precision.vec)
  idx1 <- which.max(fscore)
  gp4 <- ggplot(data = df1) + geom_line(aes(x=x, y=y)) +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(x='Recall', y='Precision', title = paste("Max F score =", round(fscore[idx1],3),' at Q value ',round(perf3@alpha.values[[1]][idx1],2))) +
    geom_point(aes(x=recall.vec[idx1], y=precision.vec[idx1]), shape=4, size=6,color='black')

  perf.FP <- performance(pred2, "fpr", "tpr")
  # perf.FP@x.name
  # [1] "True positive rate"
  # perf.FP@y.name
  # [1] "False positive rate"
  TP.vec <- unlist(perf.FP@x.values)
  FP.vec <- unlist(perf.FP@y.values)
  FP10.index<-which.min(abs(FP.vec-0.1))
  TP.vec.F10<-TP.vec[FP10.index]
  est.AUC.FPTP.val <- sum(diff(FP.vec)*TP.vec[2:length(TP.vec)])
  df1 <- data.frame(x=FP.vec,
                    y=TP.vec)
  gp5 <- ggplot(data = df1) + geom_line(aes(x=x, y=y)) +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(x='False positive rate', y='True positive rate', title = paste("AUC =",round(est.AUC.FPTP.val,3)))

  perf.FP1 <- performance(pred2, "prec", "rec")
  FDR.vec <- 1-unlist(perf.FP1@y.values)
  alpha.vec <- unlist(perf.FP1@alpha.values)
  df1 <- data.frame(x=alpha.vec,
                    y=FDR.vec)
  gp6 <- ggplot(data = df1) + geom_line(aes(x=x, y=y)) +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(x='Q values', y='False discovery rate', title = "Q value v. FDR")

  ggsave(plot.filename, marrangeGrob(list(gp1,gp2,gp3,gp4,gp5,gp6), ncol=3, nrow=2, layout_matrix = matrix(seq_len(6), nrow = 2, ncol = 3, byrow = T), top = NULL), height = 7, width = 14)
  npos <- length(which(DV.fit.res$is.known.variant[subset]==1))
  nneg <- length(subset) - npos
  DVboost.sample.res <-
    cbind(sample.ID = input.sample.ID,Fscore_PR = fscore[idx1],
          AUC.PR.known.var = est.AUC.val, AUC.FP.TP.known.var = est.AUC.FPTP.val, npos=npos, nneg=nneg)

  write.table(x = DVboost.sample.res, file = DV.sampleQC.filename,
              row.names = FALSE, sep = "\t", quote = FALSE)
}
Liuy12/DVboost documentation built on May 25, 2022, 6:17 a.m.